From 8f9856e585616a460b5f31160f867f70c4a80c69 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Thu, 21 Nov 2024 16:29:38 -0500
Subject: [PATCH 01/17] Start with rna-seq annotated filename functions

---
 src/encoded/commands/create_annotated_filenames.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py
index 3bda783d7..675a2d83a 100644
--- a/src/encoded/commands/create_annotated_filenames.py
+++ b/src/encoded/commands/create_annotated_filenames.py
@@ -914,6 +914,11 @@ def get_chain_file_value(file: Dict[str, Any]) -> str:
     return CHAIN_FILE_INFO_SEPARATOR.join([source_assembly,target_assembly])
 
 
+def get_rna_seq_tsv_value(file: Dict[str, Any]) -> str:
+    """Get isoform or gene from description and gencode version for RNA-seq tsv and bam files."""
+    # Use description and file format to determine with value
+    
+
 def get_file_extension(
     file: Dict[str, Any], file_format: Dict[str, Any]
 ) -> FilenamePart:

From 7fa4fab7165310e80703b6cc0636f4960383ced0 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Tue, 26 Nov 2024 13:55:02 -0500
Subject: [PATCH 02/17] Add gene_annotation and update rnaseq filenames

---
 .../commands/create_annotated_filenames.py    | 41 ++++++++++--
 src/encoded/item_utils/file.py                | 12 +++-
 src/encoded/item_utils/file_format.py         |  6 +-
 src/encoded/project/loadxl.py                 |  1 +
 src/encoded/schemas/mixins.json               |  8 +++
 src/encoded/schemas/output_file.json          |  3 +
 src/encoded/schemas/supplementary_file.json   |  3 +
 src/encoded/tests/test_annotated_filename.py  | 65 ++++++++++++++++---
 8 files changed, 122 insertions(+), 17 deletions(-)

diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py
index 675a2d83a..60fa5ff03 100644
--- a/src/encoded/commands/create_annotated_filenames.py
+++ b/src/encoded/commands/create_annotated_filenames.py
@@ -84,6 +84,7 @@ class AssociatedItems:
     sequencing_center: Dict[str, Any]
     software: List[Dict[str, Any]]
     reference_genome: Dict[str, Any]
+    gene_annotation: Dict[str, Any]
     file_sets: List[Dict[str, Any]]
     donor_specific_assembly: Dict[str, Any]
     assays: List[Dict[str, Any]]
@@ -110,6 +111,7 @@ def get_associated_items(
     file_format = get_file_format(file, request_handler)
     software = get_software(file, request_handler)
     reference_genome = get_reference_genome(file, request_handler)
+    gene_annotation = get_gene_annotation(file, request_handler)
     donor_specific_assembly = get_donor_specific_assembly(file, request_handler)
     if donor_specific_assembly:
         file_sets=get_derived_from_file_sets(file, request_handler)
@@ -130,6 +132,7 @@ def get_associated_items(
         file_format=file_format,
         software=software,
         reference_genome=reference_genome,
+        gene_annotation=gene_annotation,
         file_sets=file_sets,
         donor_specific_assembly=donor_specific_assembly,
         assays=assays,
@@ -209,6 +212,13 @@ def get_reference_genome(
     return get_item(file_utils.get_reference_genome(file), request_handler)
 
 
+def get_gene_annotation(
+    file: Dict[str, Any], request_handler: RequestHandler
+) -> Dict[str, Any]:
+    """Get gene annotation for file."""
+    return get_item(file_utils.get_gene_annotation(file), request_handler)
+
+
 def get_software(
     file: Dict[str, Any], request_handler: RequestHandler
 ) -> List[Dict[str, Any]]:
@@ -427,7 +437,11 @@ def get_annotated_filename(
     accession = get_accession(file)
     file_extension = get_file_extension(file, associated_items.file_format)
     analysis_info = get_analysis(
-        file, associated_items.software, associated_items.reference_genome,associated_items.file_format
+        file,
+        associated_items.software,
+        associated_items.reference_genome,
+        associated_items.gene_annotation,
+        associated_items.file_format
     )
     errors = collect_errors(
         project_id,
@@ -800,6 +814,7 @@ def get_analysis(
     file: Dict[str, Any],
     software: List[Dict[str, Any]],
     reference_genome: Dict[str, Any],
+    gene_annotation: Dict[str, Any],
     file_extension: Dict[str, Any],
 ) -> FilenamePart:
     """Get analysis info for file.
@@ -809,14 +824,19 @@ def get_analysis(
     """
     software_and_versions = get_software_and_versions(software)
     reference_genome_code = item_utils.get_code(reference_genome)
+    gene_annotation_code = item_utils.get_code(gene_annotation)
     errors = get_analysis_errors(file, reference_genome_code)
     if errors:
         return get_filename_part(errors=errors)
     value = get_analysis_value(
-        software_and_versions, reference_genome_code
+        software_and_versions,
+        reference_genome_code,
+        gene_annotation_code
     )
     if file_format_utils.is_chain_file(file_extension):
         value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}"
+    elif file_format_utils.is_tsv_file(file_extension) and "RNA Quantification" in file_utils.get_data_category(file):
+        value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_rna_seq_tsv_value(file)}"
     if not value:
         if file_utils.is_unaligned_reads(file):  # Think this is the only case (?)
             return get_filename_part(value=DEFAULT_ABSENT_FIELD)
@@ -842,12 +862,14 @@ def get_analysis_errors(
 
 
 def get_analysis_value(
-    software_and_versions: str, reference_genome_code: str
+    software_and_versions: str,
+    reference_genome_code: str,
+    gene_annotation_code: str
 ) -> str:
     """Get analysis value for filename."""
     to_write = [
         string
-        for string in [software_and_versions, reference_genome_code]
+        for string in [software_and_versions, reference_genome_code, gene_annotation_code]
         if string
     ]
     return ANALYSIS_INFO_SEPARATOR.join(to_write)
@@ -915,9 +937,14 @@ def get_chain_file_value(file: Dict[str, Any]) -> str:
 
 
 def get_rna_seq_tsv_value(file: Dict[str, Any]) -> str:
-    """Get isoform or gene from description and gencode version for RNA-seq tsv and bam files."""
-    # Use description and file format to determine with value
-    
+    """Get isoform or gene from data type RNA-seq tsv files."""
+    if "Gene Expression" in file_utils.get_data_type(file):
+        return "gene"
+    elif "Transcript Expression" in file_utils.get_data_type(file):
+        return "isoform"
+    else:
+        return ""
+
 
 def get_file_extension(
     file: Dict[str, Any], file_format: Dict[str, Any]
diff --git a/src/encoded/item_utils/file.py b/src/encoded/item_utils/file.py
index 595bae82f..22e29144c 100644
--- a/src/encoded/item_utils/file.py
+++ b/src/encoded/item_utils/file.py
@@ -76,6 +76,11 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any
     return properties.get("reference_genome", "")
 
 
+def get_gene_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]:
+    """Get gene annotation from properties."""
+    return properties.get("gene_annotation", "")
+
+
 def get_file_sets(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]:
     """Get file sets from properties."""
     return properties.get("file_sets", [])
@@ -425,4 +430,9 @@ def get_associated_files_status(
 
 def get_override_group_coverage(file: Dict[str, Any]) -> str:
     """Get override group coverage from properties."""
-    return file.get("override_group_coverage","")
\ No newline at end of file
+    return file.get("override_group_coverage","")
+
+
+def is_rsem_tsv(properties: Dict[str, Any], request_handler: RequestHandler) -> bool:
+    """Check if file is an RSEM tsv output file."""
+    return get_file_extension(request_handler,properties) == "tsv" and "RNA Quantification" in get_data_category(properties)
\ No newline at end of file
diff --git a/src/encoded/item_utils/file_format.py b/src/encoded/item_utils/file_format.py
index bb1679e1e..8583c6180 100644
--- a/src/encoded/item_utils/file_format.py
+++ b/src/encoded/item_utils/file_format.py
@@ -10,4 +10,8 @@ def get_other_allowed_extensions(properties: Dict[str, Any]) -> str:
 
 
 def is_chain_file(properties: Dict[str, Any]) -> bool:
-    return get_standard_file_extension(properties) == "chain.gz"
\ No newline at end of file
+    return get_standard_file_extension(properties) in ["chain.gz","chain"]
+
+
+def is_tsv_file(properties: Dict[str, Any]) -> bool:
+    return get_standard_file_extension(properties) == "tsv"
diff --git a/src/encoded/project/loadxl.py b/src/encoded/project/loadxl.py
index 8274c7b9b..474bbf752 100644
--- a/src/encoded/project/loadxl.py
+++ b/src/encoded/project/loadxl.py
@@ -11,6 +11,7 @@ class SMaHTProjectLoadxl(SnovaultProjectLoadxl):
         "file_format",
         "quality_metric",
         "reference_genome",
+        "gene_annotation",
         "software",
         "tracking_item",
         "image",
diff --git a/src/encoded/schemas/mixins.json b/src/encoded/schemas/mixins.json
index 140e44342..f28860b8a 100644
--- a/src/encoded/schemas/mixins.json
+++ b/src/encoded/schemas/mixins.json
@@ -364,6 +364,14 @@
             "minimum": 1
         }
     },
+    "gene_annotation": {
+        "gene_annotation": {
+            "title": "Gene Annotation",
+            "description": "Gene annotation used for gene or transcript quantification",
+            "type": "string",
+            "linkTo": "GeneAnnotation"
+        }
+    },
     "identifier": {
         "identifier": {
             "title": "Identifier",
diff --git a/src/encoded/schemas/output_file.json b/src/encoded/schemas/output_file.json
index 7b422360e..23526f66b 100644
--- a/src/encoded/schemas/output_file.json
+++ b/src/encoded/schemas/output_file.json
@@ -49,6 +49,9 @@
         {
             "$ref": "mixins.json#/file_release"
         },
+        {
+            "$ref": "mixins.json#/gene_annotation"
+        },
         {
             "$ref": "mixins.json#/modified"
         },
diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json
index a8bce5edf..4528972c8 100644
--- a/src/encoded/schemas/supplementary_file.json
+++ b/src/encoded/schemas/supplementary_file.json
@@ -35,6 +35,9 @@
         {
             "$ref": "mixins.json#/file_release"
         },
+        {
+            "$ref": "mixins.json#/gene_annotation"
+        },
         {
             "$ref": "mixins.json#/modified"
         },
diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py
index 5b3176134..ccaca2878 100644
--- a/src/encoded/tests/test_annotated_filename.py
+++ b/src/encoded/tests/test_annotated_filename.py
@@ -527,8 +527,10 @@ def test_get_sequencing_center_code(
 ANOTHER_SOFTWARE = {"code": ANOTHER_SOFTWARE_CODE, "version": ANOTHER_SOFTWARE_VERSION}
 REFERENCE_GENOME_CODE = "GRCh38"
 TARGET_GENOME_CODE = "HELA_DSA"
+GENE_ANNOTATION_CODE = "gencode45"
 
 SOME_REFERENCE_GENOME = {"code": REFERENCE_GENOME_CODE}
+SOME_GENE_ANNOTATION = {"code": GENE_ANNOTATION_CODE}
 SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]}
 SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]}
 SOME_CHAIN_FILE = {
@@ -536,6 +538,14 @@ def test_get_sequencing_center_code(
     "source_assembly": REFERENCE_GENOME_CODE,
     "target_assembly": TARGET_GENOME_CODE
 }
+SOME_TSV_FILE = {
+    "data_type": ["Gene Expression"],
+    "data_category": ["RNA Quantification"]
+}
+SOME_ISOFORM_TSV_FILE = {
+    "data_type": ["Transcript Expression"],
+    "data_category": ["RNA Quantification"]
+}
 SOME_SOMATIC_VARIANT_CALLS = {"data_category": ["Somatic Variant Calls"]}
 SOME_VARIANT_CALLS = {
     "data_category": ["Somatic Variant Calls"],
@@ -556,37 +566,45 @@ def test_get_sequencing_center_code(
     "standard_file_extension": "chain.gz",
     "valid_item_types": ["SupplementaryFile"]
 }
+TSV_FILE_EXTENSION = {
+    "identifier": "TSV",
+    "standard_file_extension": "tsv",
+    "valid_item_types": ["SupplementaryFile", "OutputFile"]
+}
 
 
 @pytest.mark.parametrize(
-    "file,software,reference_genome,file_extension,expected,errors",
+    "file,software,reference_genome,gene_annotation,file_extension,expected,errors",
     [
-        ({}, [], {}, {},"" , True),
-        (SOME_UNALIGNED_READS, [], {}, SOME_FILE_EXTENSION,DEFAULT_ABSENT_FIELD, False),
+        ({}, [], {}, {}, {},"" , True),
+        (SOME_UNALIGNED_READS, [], {}, {}, SOME_FILE_EXTENSION,DEFAULT_ABSENT_FIELD, False),
         (
             SOME_UNALIGNED_READS,
             [SOME_SOFTWARE],
             {},
+            {},
             SOME_FILE_EXTENSION,
             f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}",
             False,
         ),
-        (SOME_UNALIGNED_READS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, SOME_FILE_EXTENSION, "", True),
-        (SOME_ALIGNED_READS, [], {}, {},"", True),
-        (SOME_ALIGNED_READS, [SOME_SOFTWARE], {}, SOME_FILE_EXTENSION, "", True),
+        (SOME_UNALIGNED_READS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, {}, SOME_FILE_EXTENSION, "", True),
+        (SOME_ALIGNED_READS, [], {}, {}, {},"", True),
+        (SOME_ALIGNED_READS, [SOME_SOFTWARE], {}, {}, SOME_FILE_EXTENSION, "", True),
         (
             SOME_ALIGNED_READS,
             [SOME_SOFTWARE],
             SOME_REFERENCE_GENOME,
+            {},
             SOME_FILE_EXTENSION,
             f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}",
             False,
         ),
-        (SOME_SOMATIC_VARIANT_CALLS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, VCF_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False),
+        (SOME_SOMATIC_VARIANT_CALLS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, {}, VCF_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False),
         (
             SOME_VARIANT_CALLS,
             [SOME_SOFTWARE],
             SOME_REFERENCE_GENOME,
+            {},
             VCF_FILE_EXTENSION,
             f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}",
             False,
@@ -595,6 +613,7 @@ def test_get_sequencing_center_code(
             SOME_ALIGNED_READS,
             [SOME_SOFTWARE, ANOTHER_SOFTWARE],
             SOME_REFERENCE_GENOME,
+            {},
             SOME_FILE_EXTENSION,
             f"{ANOTHER_SOFTWARE_CODE}_{ANOTHER_SOFTWARE_VERSION}_{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}",
             False,
@@ -603,6 +622,7 @@ def test_get_sequencing_center_code(
             SOME_ALIGNED_READS,
             [SOME_SOFTWARE, SOME_ITEM],
             SOME_REFERENCE_GENOME,
+            {},
             SOME_FILE_EXTENSION,
             f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}",
             False,
@@ -611,22 +631,51 @@ def test_get_sequencing_center_code(
             SOME_CHAIN_FILE,
             [SOME_SOFTWARE, SOME_ITEM],
             {},
+            {},
             CHAIN_FILE_EXTENSION,
             f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}To{TARGET_GENOME_CODE}",
             False,
         ),
+        (
+            SOME_TSV_FILE,
+            [SOME_SOFTWARE],
+            SOME_REFERENCE_GENOME,
+            SOME_GENE_ANNOTATION,
+            TSV_FILE_EXTENSION,
+            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_gene",
+            False
+        ),
+                (
+            SOME_ISOFORM_TSV_FILE,
+            [SOME_SOFTWARE],
+            SOME_REFERENCE_GENOME,
+            SOME_GENE_ANNOTATION,
+            TSV_FILE_EXTENSION,
+            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_isoform",
+            False
+        ),
+        (
+            SOME_ALIGNED_READS,
+            [SOME_SOFTWARE],
+            SOME_REFERENCE_GENOME,
+            SOME_GENE_ANNOTATION,
+            SOME_FILE_EXTENSION,
+            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}",
+            False
+        )
     ],
 )
 def test_get_analysis(
     file: Dict[str, Any],
     software: List[Dict[str, Any]],
     reference_genome: Dict[str, Any],
+    gene_annotation: Dict[str, Any],
     file_extension: Dict[str, Any],
     expected: str,
     errors: bool,
 ) -> None:
     """Test analysis info retrieval for annotated filenames."""
-    result = get_analysis(file, software, reference_genome, file_extension)
+    result = get_analysis(file, software, reference_genome, gene_annotation, file_extension)
     assert_filename_part_matches(result, expected, errors)
 
 

From a8a2208f714df6527e316ad3e51ddb03eb8e3747 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Tue, 26 Nov 2024 15:16:29 -0500
Subject: [PATCH 03/17] Fix tests: add gene_annotation to output_file workbook insert

---
 src/encoded/tests/data/workbook-inserts/output_file.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json
index cd93cf536..b04c8f60e 100644
--- a/src/encoded/tests/data/workbook-inserts/output_file.json
+++ b/src/encoded/tests/data/workbook-inserts/output_file.json
@@ -70,6 +70,7 @@
             "foo:software_vep"
         ],
         "reference_genome": "GRCh38",
+        "gene_annotation": "gencode45",
         "status": "released",
         "dataset": "colo829t",
         "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38.aligned.sorted.phased.bam"

From dec92ca5d07b09d742acfbe96e868fc4c20324d1 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Tue, 26 Nov 2024 15:22:05 -0500
Subject: [PATCH 04/17] Add GeneAnnotation schema, type, and workbook insert

---
 src/encoded/schemas/gene_annotation.json      | 113 ++++++++++++++++++
 .../workbook-inserts/gene_annotation.json     |  10 ++
 src/encoded/types/gene_annotation.py          |  21 ++++
 3 files changed, 144 insertions(+)
 create mode 100644 src/encoded/schemas/gene_annotation.json
 create mode 100644 src/encoded/tests/data/workbook-inserts/gene_annotation.json
 create mode 100644 src/encoded/types/gene_annotation.py

diff --git a/src/encoded/schemas/gene_annotation.json b/src/encoded/schemas/gene_annotation.json
new file mode 100644
index 000000000..c9b404078
--- /dev/null
+++ b/src/encoded/schemas/gene_annotation.json
@@ -0,0 +1,113 @@
+{
+    "title": "Gene Annotation",
+    "$id": "/profiles/gene_annotation.json",
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "type": "object",
+    "required": [
+        "identifier",
+        "title"
+    ],
+    "anyOf": [
+        {
+            "allOf": [
+                {"required": ["submission_centers"]},
+                {"required": ["consortia"]}
+            ]
+        },
+        {
+            "required": [
+                "consortia"
+            ]
+        }
+    ],
+    "identifyingProperties": [
+        "accession",
+        "aliases",
+        "identifier",
+        "uuid"
+    ],
+    "additionalProperties": false,
+    "mixinProperties": [
+        {
+            "$ref": "mixins.json#/accession"
+        },
+        {
+            "$ref": "mixins.json#/aliases"
+        },
+        {
+            "$ref": "mixins.json#/attribution"
+        },
+        {
+            "$ref": "mixins.json#/code"
+        },
+        {
+            "$ref": "mixins.json#/description"
+        },
+        {
+            "$ref": "mixins.json#/identifier"
+        },
+        {
+            "$ref": "mixins.json#/modified"
+        },
+        {
+            "$ref": "mixins.json#/schema_version"
+        },
+        {
+            "$ref": "mixins.json#/status"
+        },
+        {
+            "$ref": "mixins.json#/submitted"
+        },
+        {
+            "$ref": "mixins.json#/tags"
+        },
+        {
+            "$ref": "mixins.json#/title"
+        },
+        {
+            "$ref": "mixins.json#/url"
+        },
+        {
+            "$ref": "mixins.json#/uuid"
+        },
+        {
+            "$ref": "mixins.json#/version"
+        }
+    ],
+    "properties": {
+        "accession": {
+            "accessionType": "GA"
+        },
+        "code": {
+            "pattern": "^[A-Za-z0-9]{3,}$"
+        },
+        "schema_version": {
+            "default": "1"
+        },
+        "version": {
+            "pattern": "^[A-Za-z0-9_-.]{3,}$"
+        },
+        "files": {
+            "title": "Files",
+            "description": "Files associated with the gene annotation",
+            "type": "array",
+            "minItems": 1,
+            "uniqueItems": true,
+            "items": {
+                "type": "string",
+                "linkTo": "File"
+            }
+        }
+    },
+    "columns": {
+        "identifier": {
+            "title": "Identifier"
+        },
+        "code": {
+            "title": "Code"
+        },
+        "title": {
+            "title": "Title"
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/encoded/tests/data/workbook-inserts/gene_annotation.json b/src/encoded/tests/data/workbook-inserts/gene_annotation.json
new file mode 100644
index 000000000..19bb20293
--- /dev/null
+++ b/src/encoded/tests/data/workbook-inserts/gene_annotation.json
@@ -0,0 +1,10 @@
+[
+    {
+        "identifier": "gencode45",
+        "title": "GENCODE v45",
+	"code": "gencode45",
+        "consortia": [
+            "smaht"
+        ]
+    }
+]
diff --git a/src/encoded/types/gene_annotation.py b/src/encoded/types/gene_annotation.py
new file mode 100644
index 000000000..961f6e5b5
--- /dev/null
+++ b/src/encoded/types/gene_annotation.py
@@ -0,0 +1,21 @@
+from snovault import collection, load_schema
+
+from .base import Item
+
+def _build_gene_annotation_embedded_list():
+    """Embeds for search on gene annotations."""
+    return []
+
+
+@collection(
+    name="gene-annotations",
+    unique_key="gene_annotation:identifier",
+    properties={
+        "title": "Gene Annotations",
+        "description": "Gene annotations for gene and transcript quantification",
+    },
+)
+class GeneAnnotation(Item):
+    item_type = "gene_annotation"
+    schema = load_schema("encoded:schemas/gene_annotation.json")
+    embedded_list = _build_gene_annotation_embedded_list()

From cb10cb300b06ae51f4f2542a124ca388522a0eb7 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Tue, 26 Nov 2024 15:37:30 -0500
Subject: [PATCH 05/17] update consortia requirement

---
 src/encoded/schemas/gene_annotation.json                     | 5 ++---
 src/encoded/tests/data/workbook-inserts/gene_annotation.json | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/encoded/schemas/gene_annotation.json b/src/encoded/schemas/gene_annotation.json
index c9b404078..b46715c6a 100644
--- a/src/encoded/schemas/gene_annotation.json
+++ b/src/encoded/schemas/gene_annotation.json
@@ -9,9 +9,8 @@
     ],
     "anyOf": [
         {
-            "allOf": [
-                {"required": ["submission_centers"]},
-                {"required": ["consortia"]}
+            "required": [
+                "submission_centers"
             ]
         },
         {
diff --git a/src/encoded/tests/data/workbook-inserts/gene_annotation.json b/src/encoded/tests/data/workbook-inserts/gene_annotation.json
index 19bb20293..79ddcacd3 100644
--- a/src/encoded/tests/data/workbook-inserts/gene_annotation.json
+++ b/src/encoded/tests/data/workbook-inserts/gene_annotation.json
@@ -2,7 +2,7 @@
     {
         "identifier": "gencode45",
         "title": "GENCODE v45",
-	"code": "gencode45",
+        "code": "gencode45",
         "consortia": [
             "smaht"
         ]

From 5cf2fea00d1b1090e06b777fd2f265a400f21b69 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Tue, 26 Nov 2024 15:52:13 -0500
Subject: [PATCH 06/17] Fix annotated filename insert

---
 src/encoded/tests/data/workbook-inserts/output_file.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json
index b04c8f60e..cfd6e1e59 100644
--- a/src/encoded/tests/data/workbook-inserts/output_file.json
+++ b/src/encoded/tests/data/workbook-inserts/output_file.json
@@ -73,6 +73,6 @@
         "gene_annotation": "gencode45",
         "status": "released",
         "dataset": "colo829t",
-        "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38.aligned.sorted.phased.bam"
+        "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam"
     }
 ]

From de202b15ea5bf65ffadf184ff374a4e2ddfce83a Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Mon, 2 Dec 2024 13:03:17 -0500
Subject: [PATCH 07/17] remove files from gene annotation

---
 src/encoded/schemas/gene_annotation.json | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/encoded/schemas/gene_annotation.json b/src/encoded/schemas/gene_annotation.json
index b46715c6a..45d6b6d12 100644
--- a/src/encoded/schemas/gene_annotation.json
+++ b/src/encoded/schemas/gene_annotation.json
@@ -85,17 +85,6 @@
         },
         "version": {
             "pattern": "^[A-Za-z0-9_-.]{3,}$"
-        },
-        "files": {
-            "title": "Files",
-            "description": "Files associated with the gene annotation",
-            "type": "array",
-            "minItems": 1,
-            "uniqueItems": true,
-            "items": {
-                "type": "string",
-                "linkTo": "File"
-            }
         }
     },
     "columns": {

From a47704d501d16f436e4aad893827bd153ac83941 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Tue, 3 Dec 2024 12:02:34 -0500
Subject: [PATCH 08/17] Merge up-to-date with main

---
 CHANGELOG.rst  | 8 ++++++++
 pyproject.toml | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c5d81b907..0b8468651 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,14 @@ smaht-portal
 Change Log
 ----------
 
+0.116.0
+=======
+`PR 299 SN RNA-seq filenames </~https://github.com/smaht-dac/smaht-portal/pull/299>`
+
+* Create new item GeneAnnotation that OutputFile and SupplementaryFile link to with property `gene_annotation`
+* Update `commands/create_annotated_filenames.py` to include gencode version and gene/isoform information for RSEM tsv output files and RNA-seq aligned bams
+
+
 0.115.0
 =======
 `PR 296 SN Sequencing validation </~https://github.com/smaht-dac/smaht-portal/pull/296>`
diff --git a/pyproject.toml b/pyproject.toml
index 5b006392c..74b28f921 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "encoded"
-version = "0.115.0"
+version = "0.116.0"
 description = "SMaHT Data Analysis Portal"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"

From 393b526896d960bb083ed4028489407debf8bf37 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Wed, 4 Dec 2024 14:19:27 -0500
Subject: [PATCH 09/17] Add error message for no transcript info

---
 .../commands/create_annotated_filenames.py    | 50 +++++++++++++------
 src/encoded/item_utils/file.py                |  5 --
 src/encoded/tests/test_annotated_filename.py  | 24 ++++++++-
 3 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py
index 60fa5ff03..f0eecc3e1 100644
--- a/src/encoded/commands/create_annotated_filenames.py
+++ b/src/encoded/commands/create_annotated_filenames.py
@@ -33,6 +33,9 @@
 ANALYSIS_INFO_SEPARATOR = "_"
 CHAIN_FILE_INFO_SEPARATOR = "To"
 
+RNA_DATA_CATEGORY = "RNA Quantification"
+GENE_DATA_TYPE = "Gene Expression"
+ISOFORM_DATA_TYPE = "Transcript Expression"
 
 DEFAULT_PROJECT_ID = constants.PRODUCTION_PREFIX
 DEFAULT_ABSENT_FIELD = "X"
@@ -825,18 +828,24 @@ def get_analysis(
     software_and_versions = get_software_and_versions(software)
     reference_genome_code = item_utils.get_code(reference_genome)
     gene_annotation_code = item_utils.get_code(gene_annotation)
-    errors = get_analysis_errors(file, reference_genome_code)
-    if errors:
-        return get_filename_part(errors=errors)
+    transcript_info_code = get_rna_seq_tsv_value(file, file_extension)
     value = get_analysis_value(
         software_and_versions,
         reference_genome_code,
-        gene_annotation_code
+        gene_annotation_code,
+        transcript_info_code
     )
     if file_format_utils.is_chain_file(file_extension):
         value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}"
-    elif file_format_utils.is_tsv_file(file_extension) and "RNA Quantification" in file_utils.get_data_category(file):
-        value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_rna_seq_tsv_value(file)}"
+    errors = get_analysis_errors(
+        file,
+        reference_genome_code,
+        gene_annotation_code,
+        transcript_info_code,
+        file_extension,
+    )
+    if errors:
+        return get_filename_part(errors=errors)
     if not value:
         if file_utils.is_unaligned_reads(file):  # Think this is the only case (?)
             return get_filename_part(value=DEFAULT_ABSENT_FIELD)
@@ -845,7 +854,11 @@ def get_analysis(
 
 
 def get_analysis_errors(
-    file: Dict[str, Any], reference_genome_code: str
+    file: Dict[str, Any], 
+    reference_genome_code: str,
+    gene_annotation_code: str,
+    transcript_info_code:  str,
+    file_extension: Dict[str, Any]
 ) -> List[str]:
     """Get analysis errors for file by file type."""
     errors = []
@@ -858,18 +871,24 @@ def get_analysis_errors(
     if file_utils.is_variant_calls(file):
         if not reference_genome_code:
             errors.append("No reference genome code found")
+    if RNA_DATA_CATEGORY in file_utils.get_data_category(file):
+        if not gene_annotation_code:
+            errors.append("No gene annotation code found")
+        elif file_format_utils.is_tsv_file(file_extension) and not transcript_info_code:
+            errors.append("No gene or isoform code found")
     return errors
 
 
 def get_analysis_value(
     software_and_versions: str,
     reference_genome_code: str,
-    gene_annotation_code: str
+    gene_annotation_code: str,
+    transcript_info_code: str
 ) -> str:
     """Get analysis value for filename."""
     to_write = [
         string
-        for string in [software_and_versions, reference_genome_code, gene_annotation_code]
+        for string in [software_and_versions, reference_genome_code, gene_annotation_code, transcript_info_code]
         if string
     ]
     return ANALYSIS_INFO_SEPARATOR.join(to_write)
@@ -936,12 +955,13 @@ def get_chain_file_value(file: Dict[str, Any]) -> str:
     return CHAIN_FILE_INFO_SEPARATOR.join([source_assembly,target_assembly])
 
 
-def get_rna_seq_tsv_value(file: Dict[str, Any]) -> str:
-    """Get isoform or gene from data type RNA-seq tsv files."""
-    if "Gene Expression" in file_utils.get_data_type(file):
-        return "gene"
-    elif "Transcript Expression" in file_utils.get_data_type(file):
-        return "isoform"
+def get_rna_seq_tsv_value(file: Dict[str, Any], file_extension: Dict[str, Any]) -> str:
+    """Return "gene" or "isoform" from the data type of an RNA-seq tsv file; "" (never None) otherwise."""
+    is_rna_tsv = file_format_utils.is_tsv_file(file_extension) and RNA_DATA_CATEGORY in file_utils.get_data_category(file)
+    if is_rna_tsv and GENE_DATA_TYPE in file_utils.get_data_type(file):
+        return "gene"
+    elif is_rna_tsv and ISOFORM_DATA_TYPE in file_utils.get_data_type(file):
+        return "isoform"
     else:
         return ""
 
diff --git a/src/encoded/item_utils/file.py b/src/encoded/item_utils/file.py
index 22e29144c..e40d3cc22 100644
--- a/src/encoded/item_utils/file.py
+++ b/src/encoded/item_utils/file.py
@@ -431,8 +431,3 @@ def get_associated_files_status(
 def get_override_group_coverage(file: Dict[str, Any]) -> str:
     """Get override group coverage from properties."""
     return file.get("override_group_coverage","")
-
-
-def is_rsem_tsv(properties: Dict[str, Any], request_handler: RequestHandler) -> bool:
-    """Check if file is an RSEM tsv output file."""
-    return get_file_extension(request_handler,properties) == "tsv" and "RNA Quantification" in get_data_category(properties)
\ No newline at end of file
diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py
index ccaca2878..c2f7b2d63 100644
--- a/src/encoded/tests/test_annotated_filename.py
+++ b/src/encoded/tests/test_annotated_filename.py
@@ -533,6 +533,7 @@ def test_get_sequencing_center_code(
 SOME_GENE_ANNOTATION = {"code": GENE_ANNOTATION_CODE}
 SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]}
 SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]}
+RNA_ALIGNED_READS = {"data_type": ["Aligned Reads"], "data_category": ["RNA Quantification"]}
 SOME_CHAIN_FILE = {
     "data_type": ["SupplementaryFile"],
     "source_assembly": REFERENCE_GENOME_CODE,
@@ -542,6 +543,9 @@ def test_get_sequencing_center_code(
     "data_type": ["Gene Expression"],
     "data_category": ["RNA Quantification"]
 }
+SOME_OTHER_FILE = {
+    "data_category": ["RNA Quantification"]
+}
 SOME_ISOFORM_TSV_FILE = {
     "data_type": ["Transcript Expression"],
     "data_category": ["RNA Quantification"]
@@ -645,7 +649,7 @@ def test_get_sequencing_center_code(
             f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_gene",
             False
         ),
-                (
+        (
             SOME_ISOFORM_TSV_FILE,
             [SOME_SOFTWARE],
             SOME_REFERENCE_GENOME,
@@ -654,6 +658,15 @@ def test_get_sequencing_center_code(
             f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_isoform",
             False
         ),
+        (
+            SOME_OTHER_FILE,
+            [SOME_SOFTWARE],
+            SOME_REFERENCE_GENOME,
+            SOME_GENE_ANNOTATION,
+            TSV_FILE_EXTENSION,
+            "",
+            True
+        ),
         (
             SOME_ALIGNED_READS,
             [SOME_SOFTWARE],
@@ -662,6 +675,15 @@ def test_get_sequencing_center_code(
             SOME_FILE_EXTENSION,
             f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}",
             False
+        ),
+        (
+            RNA_ALIGNED_READS,
+            [SOME_SOFTWARE],
+            SOME_REFERENCE_GENOME,
+            {},
+            SOME_FILE_EXTENSION,
+            "",
+            True
         )
     ],
 )

From c275ce45b69d377254e3c18322b1dbc9a7ec0e69 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Thu, 5 Dec 2024 13:04:46 -0500
Subject: [PATCH 10/17] Make annotation a reference file link

---
 .../commands/create_annotated_filenames.py    |  38 +++++--
 src/encoded/item_utils/file.py                |   6 +-
 src/encoded/project/loadxl.py                 |   1 -
 src/encoded/schemas/gene_annotation.json      | 101 ------------------
 src/encoded/schemas/mixins.json               |  19 ++--
 src/encoded/schemas/output_file.json          |   2 +-
 src/encoded/schemas/reference_file.json       |   3 +
 src/encoded/schemas/supplementary_file.json   |   2 +-
 .../data/workbook-inserts/file_format.json    |  11 ++
 .../workbook-inserts/gene_annotation.json     |  10 --
 .../data/workbook-inserts/output_file.json    |   4 +-
 .../data/workbook-inserts/reference_file.json |  17 +++
 src/encoded/tests/test_annotated_filename.py  |   8 +-
 .../tests/test_metadata_tsv_workbook.py       |   4 +-
 src/encoded/types/gene_annotation.py          |  21 ----
 15 files changed, 85 insertions(+), 162 deletions(-)
 delete mode 100644 src/encoded/schemas/gene_annotation.json
 delete mode 100644 src/encoded/tests/data/workbook-inserts/gene_annotation.json
 delete mode 100644 src/encoded/types/gene_annotation.py

diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py
index f0eecc3e1..20277e337 100644
--- a/src/encoded/commands/create_annotated_filenames.py
+++ b/src/encoded/commands/create_annotated_filenames.py
@@ -87,7 +87,7 @@ class AssociatedItems:
     sequencing_center: Dict[str, Any]
     software: List[Dict[str, Any]]
     reference_genome: Dict[str, Any]
-    gene_annotation: Dict[str, Any]
+    gene_annotations: Dict[str, Any]
     file_sets: List[Dict[str, Any]]
     donor_specific_assembly: Dict[str, Any]
     assays: List[Dict[str, Any]]
@@ -114,7 +114,7 @@ def get_associated_items(
     file_format = get_file_format(file, request_handler)
     software = get_software(file, request_handler)
     reference_genome = get_reference_genome(file, request_handler)
-    gene_annotation = get_gene_annotation(file, request_handler)
+    gene_annotations = get_gene_annotations(file, request_handler)
     donor_specific_assembly = get_donor_specific_assembly(file, request_handler)
     if donor_specific_assembly:
         file_sets=get_derived_from_file_sets(file, request_handler)
@@ -135,7 +135,7 @@ def get_associated_items(
         file_format=file_format,
         software=software,
         reference_genome=reference_genome,
-        gene_annotation=gene_annotation,
+        gene_annotations=gene_annotations,
         file_sets=file_sets,
         donor_specific_assembly=donor_specific_assembly,
         assays=assays,
@@ -215,11 +215,11 @@ def get_reference_genome(
     return get_item(file_utils.get_reference_genome(file), request_handler)
 
 
-def get_gene_annotation(
+def get_gene_annotations(
     file: Dict[str, Any], request_handler: RequestHandler
-) -> Dict[str, Any]:
-    """Get gene annotation for file."""
-    return get_item(file_utils.get_gene_annotation(file), request_handler)
+) -> List[Dict[str, Any]]:
+    """Get the gene annotation items linked from the file's annotation property."""
+    return get_items(file_utils.get_annotation(file), request_handler)
 
 
 def get_software(
@@ -443,7 +443,7 @@ def get_annotated_filename(
         file,
         associated_items.software,
         associated_items.reference_genome,
-        associated_items.gene_annotation,
+        associated_items.gene_annotations,
         associated_items.file_format
     )
     errors = collect_errors(
@@ -817,7 +817,7 @@ def get_analysis(
     file: Dict[str, Any],
     software: List[Dict[str, Any]],
     reference_genome: Dict[str, Any],
-    gene_annotation: Dict[str, Any],
+    gene_annotations: Dict[str, Any],
     file_extension: Dict[str, Any],
 ) -> FilenamePart:
     """Get analysis info for file.
@@ -827,7 +827,7 @@ def get_analysis(
     """
     software_and_versions = get_software_and_versions(software)
     reference_genome_code = item_utils.get_code(reference_genome)
-    gene_annotation_code = item_utils.get_code(gene_annotation)
+    gene_annotation_code = get_gene_annotation_codes(gene_annotations)
     transcript_info_code = get_rna_seq_tsv_value(file, file_extension)
     value = get_analysis_value(
         software_and_versions,
@@ -894,6 +894,26 @@ def get_analysis_value(
     return ANALYSIS_INFO_SEPARATOR.join(to_write)
 
 
+def get_gene_annotation_codes(gene_annotations: List[Dict[str, Any]]) -> str:
+    """Get gene annotation codes for file.
+    """
+    codes = [item for item in gene_annotations if item_utils.get_code(item)]
+    if not codes:
+        return ""
+    return get_gene_annotation_codes_string(codes)
+
+
+def get_gene_annotation_codes_string(annotation_items: List[Dict[str, Any]]) -> str:
+    """Get string representation of gene annotation codes."""
+    sorted_annotation_items = sorted(annotation_items, key=item_utils.get_code)
+    return ANALYSIS_INFO_SEPARATOR.join(
+        [
+            item_utils.get_code(item)
+            for item in sorted_annotation_items
+        ]
+    )
+
+
 def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
     """Get software and accompanying versions for file.
 
diff --git a/src/encoded/item_utils/file.py b/src/encoded/item_utils/file.py
index e40d3cc22..777ec1747 100644
--- a/src/encoded/item_utils/file.py
+++ b/src/encoded/item_utils/file.py
@@ -76,9 +76,9 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any
     return properties.get("reference_genome", "")
 
 
-def get_gene_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]:
-    """Get gene annotation from properties."""
-    return properties.get("gene_annotation", "")
+def get_annotation(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]:
+    """Get annotation (list of gene annotation links) from properties; empty list if absent."""
+    return properties.get("annotation", [])
 
 
 def get_file_sets(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]:
diff --git a/src/encoded/project/loadxl.py b/src/encoded/project/loadxl.py
index 474bbf752..8274c7b9b 100644
--- a/src/encoded/project/loadxl.py
+++ b/src/encoded/project/loadxl.py
@@ -11,7 +11,6 @@ class SMaHTProjectLoadxl(SnovaultProjectLoadxl):
         "file_format",
         "quality_metric",
         "reference_genome",
-        "gene_annotation",
         "software",
         "tracking_item",
         "image",
diff --git a/src/encoded/schemas/gene_annotation.json b/src/encoded/schemas/gene_annotation.json
deleted file mode 100644
index 45d6b6d12..000000000
--- a/src/encoded/schemas/gene_annotation.json
+++ /dev/null
@@ -1,101 +0,0 @@
-{
-    "title": "Gene Annotation",
-    "$id": "/profiles/gene_annotation.json",
-    "$schema": "https://json-schema.org/draft/2020-12/schema",
-    "type": "object",
-    "required": [
-        "identifier",
-        "title"
-    ],
-    "anyOf": [
-        {
-            "required": [
-                "submission_centers"
-            ]
-        },
-        {
-            "required": [
-                "consortia"
-            ]
-        }
-    ],
-    "identifyingProperties": [
-        "accession",
-        "aliases",
-        "identifier",
-        "uuid"
-    ],
-    "additionalProperties": false,
-    "mixinProperties": [
-        {
-            "$ref": "mixins.json#/accession"
-        },
-        {
-            "$ref": "mixins.json#/aliases"
-        },
-        {
-            "$ref": "mixins.json#/attribution"
-        },
-        {
-            "$ref": "mixins.json#/code"
-        },
-        {
-            "$ref": "mixins.json#/description"
-        },
-        {
-            "$ref": "mixins.json#/identifier"
-        },
-        {
-            "$ref": "mixins.json#/modified"
-        },
-        {
-            "$ref": "mixins.json#/schema_version"
-        },
-        {
-            "$ref": "mixins.json#/status"
-        },
-        {
-            "$ref": "mixins.json#/submitted"
-        },
-        {
-            "$ref": "mixins.json#/tags"
-        },
-        {
-            "$ref": "mixins.json#/title"
-        },
-        {
-            "$ref": "mixins.json#/url"
-        },
-        {
-            "$ref": "mixins.json#/uuid"
-        },
-        {
-            "$ref": "mixins.json#/version"
-        }
-    ],
-    "properties": {
-        "accession": {
-            "accessionType": "GA"
-        },
-        "code": {
-            "pattern": "^[A-Za-z0-9]{3,}$"
-        },
-        "schema_version": {
-            "default": "1"
-        },
-        "version": {
-            "pattern": "^[A-Za-z0-9_-.]{3,}$"
-        }
-    },
-    "columns": {
-        "identifier": {
-            "title": "Identifier"
-        },
-        "code": {
-            "title": "Code"
-        },
-        "title": {
-            "title": "Title"
-        }
-    }
-}
\ No newline at end of file
diff --git a/src/encoded/schemas/mixins.json b/src/encoded/schemas/mixins.json
index f28860b8a..4d74dab42 100644
--- a/src/encoded/schemas/mixins.json
+++ b/src/encoded/schemas/mixins.json
@@ -74,6 +74,17 @@
             }
         }
     },
+    "annotation": {
+        "annotation": {
+            "title": "Gene Annotation",
+            "description": "Gene annotation used for gene or transcript quantification",
+            "type": "array",
+            "items": {
+                "type": "string",
+                "linkTo": "ReferenceFile"
+            }
+        }
+    },
     "attachment": {
         "attachment": {
             "title": "Attached File",
@@ -364,14 +375,6 @@
             "minimum": 1
         }
     },
-    "gene_annotation": {
-        "gene_annotation": {
-            "title": "Gene Annotation",
-            "description": "Gene annotation used for gene or transcript quantification",
-            "type": "string",
-            "linkTo": "GeneAnnotation"
-        }
-    },
     "identifier": {
         "identifier": {
             "title": "Identifier",
diff --git a/src/encoded/schemas/output_file.json b/src/encoded/schemas/output_file.json
index 23526f66b..9ef4237a0 100644
--- a/src/encoded/schemas/output_file.json
+++ b/src/encoded/schemas/output_file.json
@@ -50,7 +50,7 @@
             "$ref": "mixins.json#/file_release"
         },
         {
-            "$ref": "mixins.json#/gene_annotation"
+            "$ref": "mixins.json#/annotation"
         },
         {
             "$ref": "mixins.json#/modified"
diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json
index c6c74c99a..55fb22c14 100644
--- a/src/encoded/schemas/reference_file.json
+++ b/src/encoded/schemas/reference_file.json
@@ -37,6 +37,9 @@
         {
             "$ref": "mixins.json#/attribution"
         },
+        {
+            "$ref": "mixins.json#/code"
+        },
         {
             "$ref": "mixins.json#/description"
         },
diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json
index 4528972c8..2d6b04b58 100644
--- a/src/encoded/schemas/supplementary_file.json
+++ b/src/encoded/schemas/supplementary_file.json
@@ -36,7 +36,7 @@
             "$ref": "mixins.json#/file_release"
         },
         {
-            "$ref": "mixins.json#/gene_annotation"
+            "$ref": "mixins.json#/annotation"
         },
         {
             "$ref": "mixins.json#/modified"
diff --git a/src/encoded/tests/data/workbook-inserts/file_format.json b/src/encoded/tests/data/workbook-inserts/file_format.json
index 8c1b40b82..baf9f0bd7 100644
--- a/src/encoded/tests/data/workbook-inserts/file_format.json
+++ b/src/encoded/tests/data/workbook-inserts/file_format.json
@@ -80,5 +80,16 @@
             "ReferenceFile",
             "SupplementaryFile"
         ]
+    },
+    {
+        "uuid": "c3e54d5f-647c-4ca2-9b21-5c01caa3f691",
+        "submission_centers": [
+            "smaht"
+        ],
+        "identifier": "GTF",
+        "standard_file_extension": "gtf",
+        "valid_item_types": [
+            "ReferenceFile"
+        ]
     }
 ]
\ No newline at end of file
diff --git a/src/encoded/tests/data/workbook-inserts/gene_annotation.json b/src/encoded/tests/data/workbook-inserts/gene_annotation.json
deleted file mode 100644
index 79ddcacd3..000000000
--- a/src/encoded/tests/data/workbook-inserts/gene_annotation.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "identifier": "gencode45",
-        "title": "GENCODE v45",
-        "code": "gencode45",
-        "consortia": [
-            "smaht"
-        ]
-    }
-]
diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json
index 8a8d07a6d..1aa9a7af9 100644
--- a/src/encoded/tests/data/workbook-inserts/output_file.json
+++ b/src/encoded/tests/data/workbook-inserts/output_file.json
@@ -70,7 +70,9 @@
             "foo:software_vep"
         ],
         "reference_genome": "GRCh38",
-        "gene_annotation": "gencode45",
+        "annotation": [
+            "smaht:ReferenceFile-collapsed-genes-gencode_v45"
+        ],
         "status": "released",
         "dataset": "colo829t",
         "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam"
diff --git a/src/encoded/tests/data/workbook-inserts/reference_file.json b/src/encoded/tests/data/workbook-inserts/reference_file.json
index a0b54987d..f89f0be2a 100644
--- a/src/encoded/tests/data/workbook-inserts/reference_file.json
+++ b/src/encoded/tests/data/workbook-inserts/reference_file.json
@@ -21,5 +21,22 @@
         ],
         "file_size": 8000,
         "status": "restricted"
+    },
+    {
+        "uuid": "8fc6f554-59c9-490a-b6dc-86665a0b971d",
+        "aliases": [
+            "smaht:ReferenceFile-collapsed-genes-gencode_v45"
+        ],
+        "code": "gencode45",
+        "data_type": [
+            "Gene Model"
+        ],
+        "data_category": [
+            "Genome Annotation"
+        ],
+        "file_format": "GTF",
+        "consortia": [
+            "smaht"
+        ]
     }
 ]
\ No newline at end of file
diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py
index c2f7b2d63..a6574a810 100644
--- a/src/encoded/tests/test_annotated_filename.py
+++ b/src/encoded/tests/test_annotated_filename.py
@@ -530,7 +530,7 @@ def test_get_sequencing_center_code(
 GENE_ANNOTATION_CODE = "gencode45"
 
 SOME_REFERENCE_GENOME = {"code": REFERENCE_GENOME_CODE}
-SOME_GENE_ANNOTATION = {"code": GENE_ANNOTATION_CODE}
+SOME_GENE_ANNOTATION = [{"code": GENE_ANNOTATION_CODE}]
 SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]}
 SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]}
 RNA_ALIGNED_READS = {"data_type": ["Aligned Reads"], "data_category": ["RNA Quantification"]}
@@ -578,7 +578,7 @@ def test_get_sequencing_center_code(
 
 
 @pytest.mark.parametrize(
-    "file,software,reference_genome,gene_annotation,file_extension,expected,errors",
+    "file,software,reference_genome,annotation,file_extension,expected,errors",
     [
         ({}, [], {}, {}, {},"" , True),
         (SOME_UNALIGNED_READS, [], {}, {}, SOME_FILE_EXTENSION,DEFAULT_ABSENT_FIELD, False),
@@ -691,13 +691,13 @@ def test_get_analysis(
     file: Dict[str, Any],
     software: List[Dict[str, Any]],
     reference_genome: Dict[str, Any],
-    gene_annotation: Dict[str, Any],
+    annotation: Dict[str, Any],
     file_extension: Dict[str, Any],
     expected: str,
     errors: bool,
 ) -> None:
     """Test analysis info retrieval for annotated filenames."""
-    result = get_analysis(file, software, reference_genome, gene_annotation, file_extension)
+    result = get_analysis(file, software, reference_genome, annotation, file_extension)
     assert_filename_part_matches(result, expected, errors)
 
 
diff --git a/src/encoded/tests/test_metadata_tsv_workbook.py b/src/encoded/tests/test_metadata_tsv_workbook.py
index ed8d3002f..37cff652d 100644
--- a/src/encoded/tests/test_metadata_tsv_workbook.py
+++ b/src/encoded/tests/test_metadata_tsv_workbook.py
@@ -118,12 +118,12 @@ def test_metadata_tsv_workbook(self, workbook, es_testapp):
         TestMetadataTSVHelper.check_key_and_length(header1, 'Metadata TSV Download')
         TestMetadataTSVHelper.check_key_and_length(header2, 'Suggested command to download: ')
         TestMetadataTSVHelper.check_key_and_length(header3, 'FileDownloadURL')
-        assert len(parsed[3:]) == 19  # there are 19 entries in the workbook right now, including extra files
+        assert len(parsed[3:]) == 20  # there are 20 entries in the workbook right now, including extra files
         # test for various types
         TestMetadataTSVHelper.check_type_length(es_testapp, 'AlignedReads', 3)
         TestMetadataTSVHelper.check_type_length(es_testapp, 'UnalignedReads', 5)
         TestMetadataTSVHelper.check_type_length(es_testapp, 'VariantCalls', 2)
-        TestMetadataTSVHelper.check_type_length(es_testapp, 'ReferenceFile', 1)
+        TestMetadataTSVHelper.check_type_length(es_testapp, 'ReferenceFile', 2)
         TestMetadataTSVHelper.check_type_length(es_testapp, 'OutputFile', 2)
         TestMetadataTSVHelper.check_type_length(es_testapp, 'SupplementaryFile', 2)
 
diff --git a/src/encoded/types/gene_annotation.py b/src/encoded/types/gene_annotation.py
deleted file mode 100644
index 961f6e5b5..000000000
--- a/src/encoded/types/gene_annotation.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from snovault import collection, load_schema
-
-from .base import Item
-
-def _build_gene_annotation_embedded_list():
-    """Embeds for search on gene annotations."""
-    return []
-
-
-@collection(
-    name="gene-annotations",
-    unique_key="gene_annotation:identifier",
-    properties={
-        "title": "Gene Annotations",
-        "description": "Gene annotations for gene and transcript quantification",
-    },
-)
-class GeneAnnotation(Item):
-    item_type = "gene_annotation"
-    schema = load_schema("encoded:schemas/gene_annotation.json")
-    embedded_list = _build_gene_annotation_embedded_list()

From 4a60156059c18f6c5d03404dc0c170245b1b217c Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Thu, 5 Dec 2024 13:26:51 -0500
Subject: [PATCH 11/17] fix merge

---
 src/encoded/schemas/output_file.json        | 4 ----
 src/encoded/schemas/supplementary_file.json | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/src/encoded/schemas/output_file.json b/src/encoded/schemas/output_file.json
index 9d5174356..9ef4237a0 100644
--- a/src/encoded/schemas/output_file.json
+++ b/src/encoded/schemas/output_file.json
@@ -50,11 +50,7 @@
             "$ref": "mixins.json#/file_release"
         },
         {
-<<<<<<< HEAD
             "$ref": "mixins.json#/annotation"
-=======
-            "$ref": "mixins.json#/gene_annotation"
->>>>>>> main
         },
         {
             "$ref": "mixins.json#/modified"
diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json
index c10bb7c6a..2d6b04b58 100644
--- a/src/encoded/schemas/supplementary_file.json
+++ b/src/encoded/schemas/supplementary_file.json
@@ -36,11 +36,7 @@
             "$ref": "mixins.json#/file_release"
         },
         {
-<<<<<<< HEAD
             "$ref": "mixins.json#/annotation"
-=======
-            "$ref": "mixins.json#/gene_annotation"
->>>>>>> main
         },
         {
             "$ref": "mixins.json#/modified"

From 09db0746934c5c250195ac234d985c63833b54fb Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Thu, 5 Dec 2024 14:05:39 -0500
Subject: [PATCH 12/17] take annotation off supplementary file

---
 src/encoded/schemas/supplementary_file.json | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json
index 2d6b04b58..a8bce5edf 100644
--- a/src/encoded/schemas/supplementary_file.json
+++ b/src/encoded/schemas/supplementary_file.json
@@ -35,9 +35,6 @@
         {
             "$ref": "mixins.json#/file_release"
         },
-        {
-            "$ref": "mixins.json#/annotation"
-        },
         {
             "$ref": "mixins.json#/modified"
         },

From 4a9f84e02b2e03ed4b28d856ab57615ccf210dc1 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Fri, 6 Dec 2024 11:12:57 -0500
Subject: [PATCH 13/17] Update release_file and make code non-unique

---
 src/encoded/commands/release_file.py     | 14 ++++++++++++++
 src/encoded/item_utils/constants/file.py |  1 +
 src/encoded/schemas/reference_file.json  |  8 ++++++++
 src/encoded/schemas/software.json        |  3 ++-
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/encoded/commands/release_file.py b/src/encoded/commands/release_file.py
index 49c9666e2..0ceebf4ea 100644
--- a/src/encoded/commands/release_file.py
+++ b/src/encoded/commands/release_file.py
@@ -454,6 +454,9 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_OPEN
                 ),
+                file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
+                    file_constants.ACCESS_STATUS_OPEN
+                )
             },
             IPSC: {
                 file_constants.DATA_CATEGORY_SEQUENCING_READS: (
@@ -465,6 +468,9 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_PROTECTED
                 ),
+                file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
+                    file_constants.ACCESS_STATUS_OPEN
+                )
             },
             self.TISSUE: {
                 file_constants.DATA_CATEGORY_SEQUENCING_READS: (
@@ -476,12 +482,19 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_OPEN
                 ),
+                file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
+                    file_constants.ACCESS_STATUS_OPEN
+                )
             },
         }
         if dataset in [
             "colo829bl",
             "colo829t",
             "colo829blt_50to1",
+            "colo829blt_in_silico",
+            "colo829_snv_indel_challenge_data",
+            "hapmap_snv_indel_challenge_data",
+            "mei_detection_challenge_data",
             "hapmap",
             "hg002",
             "hg00438",
@@ -498,6 +511,7 @@ def get_access_status(self, dataset: str) -> str:
             "lb_ipsc_4",
             "lb_ipsc_52",
             "lb_ipsc_60",
+            "ipsc_snv_indel_challenge_data",
         ]:
             dataset_category = IPSC
         elif dataset == self.TISSUE:
diff --git a/src/encoded/item_utils/constants/file.py b/src/encoded/item_utils/constants/file.py
index 76374401e..054199cb7 100644
--- a/src/encoded/item_utils/constants/file.py
+++ b/src/encoded/item_utils/constants/file.py
@@ -6,6 +6,7 @@
 DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls"
 DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads"
 DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls"
+DATA_CATEGORY_RNA_QUANTIFICATION = "RNA Quantification"
 DATASET = "dataset"
 EXTRA_FILES = "extra_files"
 FILE_SETS = "file_sets"
diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json
index 55fb22c14..0c64abd1c 100644
--- a/src/encoded/schemas/reference_file.json
+++ b/src/encoded/schemas/reference_file.json
@@ -78,6 +78,14 @@
     "properties": {
         "schema_version": {
             "default": "2"
+        },
+        "code": {
+            "pattern": "^[A-Za-z0-9_]{2,}$",
+            "uniqueKey": false
+        },
+        "version": {
+            "description": "The version of the reference file",
+            "pattern": "^[A-Za-z0-9._-]+$"
         }
     }
 }
diff --git a/src/encoded/schemas/software.json b/src/encoded/schemas/software.json
index 1bd60a0f1..9ac40155d 100644
--- a/src/encoded/schemas/software.json
+++ b/src/encoded/schemas/software.json
@@ -125,7 +125,8 @@
             }
         },
         "code": {
-            "pattern": "^[A-Za-z0-9_]{2,}$"
+            "pattern": "^[A-Za-z0-9_]{2,}$",
+            "uniqueKey": false
         },
         "submitted_id": {
             "pattern": "^[A-Z0-9]{3,}_SOFTWARE_[A-Z0-9-_.]{4,}$",

From e020705baae82f8005afe5b5dabfb938f00bd8b5 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Fri, 6 Dec 2024 11:39:35 -0500
Subject: [PATCH 14/17] Make code separate from mixins

---
 .../commands/create_annotated_filenames.py    | 52 +++++++++++++++----
 src/encoded/schemas/reference_file.json       | 10 ++--
 src/encoded/schemas/software.json             | 10 ++--
 .../data/workbook-inserts/output_file.json    |  2 +-
 .../data/workbook-inserts/reference_file.json |  3 +-
 .../tests/data/workbook-inserts/software.json | 14 +++++
 src/encoded/tests/test_annotated_filename.py  | 12 ++---
 7 files changed, 76 insertions(+), 27 deletions(-)

diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py
index 20277e337..9ecbfb005 100644
--- a/src/encoded/commands/create_annotated_filenames.py
+++ b/src/encoded/commands/create_annotated_filenames.py
@@ -827,7 +827,7 @@ def get_analysis(
     """
     software_and_versions = get_software_and_versions(software)
     reference_genome_code = item_utils.get_code(reference_genome)
-    gene_annotation_code = get_gene_annotation_codes(gene_annotations)
+    gene_annotation_code = get_annotations_and_versions(gene_annotations)
     transcript_info_code = get_rna_seq_tsv_value(file, file_extension)
     value = get_analysis_value(
         software_and_versions,
@@ -894,26 +894,60 @@ def get_analysis_value(
     return ANALYSIS_INFO_SEPARATOR.join(to_write)
 
 
-def get_gene_annotation_codes(gene_annotations: List[Dict[str, Any]]) -> str:
-    """Get gene annotation codes for file.
+def get_annotations_and_versions(gene_annotations: List[Dict[str, Any]]) -> str:
+    """Get gene annotation codes and accompanying versions for file.
+
+    Only items with codes are considered (those expected to be used for naming).
+    Returns an empty string if none have codes or, after a logged warning, if any lacks a version.
     """
-    codes = [item for item in gene_annotations if item_utils.get_code(item)]
-    if not codes:
+    annotations_with_codes = get_annotations_with_codes(gene_annotations)
+    if not annotations_with_codes:
         return ""
-    return get_gene_annotation_codes_string(codes)
+    annotations_with_codes_and_versions = get_annotations_with_versions(annotations_with_codes)
+    if len(annotations_with_codes) == len(annotations_with_codes_and_versions):
+        return get_annotations_and_versions_string(annotations_with_codes_and_versions)
+    missing_versions = get_annotation_codes_missing_versions(annotations_with_codes)
+    logger.warning(f"Missing versions for annotation items: {missing_versions}.")
+    return ""
+
+
+def get_annotations_with_codes(
+    annotation_items: List[Dict[str, Any]]
+) -> List[Dict[str, Any]]:
+    """Get the annotation reference file items that have a code, dropping the rest."""
+    return [item for item in annotation_items if item_utils.get_code(item)]
 
 
-def get_gene_annotation_codes_string(annotation_items: List[Dict[str, Any]]) -> str:
-    """Get string representation of gene annotation codes."""
+def get_annotations_with_versions(
+    annotation_items: List[Dict[str, Any]]
+) -> List[Dict[str, Any]]:
+    """Get the annotation reference file items that have a version, dropping the rest."""
+    return [item for item in annotation_items if item_utils.get_version(item)]
+
+
+def get_annotations_and_versions_string(annotation_items: List[Dict[str, Any]]) -> str:
+    """Join each item's code and version, ordered by code, using ANALYSIS_INFO_SEPARATOR."""
     sorted_annotation_items = sorted(annotation_items, key=item_utils.get_code)
     return ANALYSIS_INFO_SEPARATOR.join(
         [
-            item_utils.get_code(item)
+            f"{item_utils.get_code(item)}{ANALYSIS_INFO_SEPARATOR}"
+            f"{item_utils.get_version(item)}"
             for item in sorted_annotation_items
         ]
     )
 
 
+def get_annotation_codes_missing_versions(
+    annotation_items: List[Dict[str, Any]]
+) -> List[str]:
+    """Get the codes of annotation reference file items that are missing versions."""
+    return [
+        item_utils.get_code(item)
+        for item in annotation_items
+        if not item_utils.get_version(item)
+    ]
+
+
 def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
     """Get software and accompanying versions for file.
 
diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json
index 0c64abd1c..a2065a894 100644
--- a/src/encoded/schemas/reference_file.json
+++ b/src/encoded/schemas/reference_file.json
@@ -37,9 +37,6 @@
         {
             "$ref": "mixins.json#/attribution"
         },
-        {
-            "$ref": "mixins.json#/code"
-        },
         {
             "$ref": "mixins.json#/description"
         },
@@ -80,8 +77,11 @@
             "default": "2"
         },
         "code": {
-            "pattern": "^[A-Za-z0-9_]{2,}$",
-            "uniqueKey": false
+            "title": "Code",
+            "description": "Code used in file naming scheme",
+            "type": "string",
+            "permission": "restricted_fields",
+            "pattern": "^[A-Za-z0-9_]{2,}$"
         },
         "version": {
             "description": "The version of the reference file",
diff --git a/src/encoded/schemas/software.json b/src/encoded/schemas/software.json
index 9ac40155d..074a739d8 100644
--- a/src/encoded/schemas/software.json
+++ b/src/encoded/schemas/software.json
@@ -59,9 +59,6 @@
         {
             "$ref": "mixins.json#/category"
         },
-        {
-            "$ref": "mixins.json#/code"
-        },
         {
             "$ref": "mixins.json#/description"
         },
@@ -125,8 +122,11 @@
             }
         },
         "code": {
-            "pattern": "^[A-Za-z0-9_]{2,}$",
-            "uniqueKey": false
+            "title": "Code",
+            "description": "Code used in file naming scheme",
+            "type": "string",
+            "permission": "restricted_fields",
+            "pattern": "^[A-Za-z0-9_]{2,}$"
         },
         "submitted_id": {
             "pattern": "^[A-Z0-9]{3,}_SOFTWARE_[A-Z0-9-_.]{4,}$",
diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json
index 1aa9a7af9..dbec34abc 100644
--- a/src/encoded/tests/data/workbook-inserts/output_file.json
+++ b/src/encoded/tests/data/workbook-inserts/output_file.json
@@ -75,6 +75,6 @@
         ],
         "status": "released",
         "dataset": "colo829t",
-        "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam"
+        "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode_v45.aligned.sorted.phased.bam"
     }
 ]
diff --git a/src/encoded/tests/data/workbook-inserts/reference_file.json b/src/encoded/tests/data/workbook-inserts/reference_file.json
index f89f0be2a..ba0330282 100644
--- a/src/encoded/tests/data/workbook-inserts/reference_file.json
+++ b/src/encoded/tests/data/workbook-inserts/reference_file.json
@@ -27,7 +27,8 @@
         "aliases": [
             "smaht:ReferenceFile-collapsed-genes-gencode_v45"
         ],
-        "code": "gencode45",
+        "code": "gencode",
+        "version": "v45",
         "data_type": [
             "Gene Model"
         ],
diff --git a/src/encoded/tests/data/workbook-inserts/software.json b/src/encoded/tests/data/workbook-inserts/software.json
index e75a29534..6de034d0d 100644
--- a/src/encoded/tests/data/workbook-inserts/software.json
+++ b/src/encoded/tests/data/workbook-inserts/software.json
@@ -27,6 +27,20 @@
         "code": "strelka",
         "version": "3.1.1"
     },
+    {
+        "uuid": "0e6ee3a4-2831-4ee4-b648-f53808282f38",
+        "submission_centers": [
+            "smaht"
+        ],
+        "submitted_id": "TEST_SOFTWARE_BWA-MEM_2.0.0",
+        "category": [
+            "Alignment"
+        ],
+        "name": "bwa_mem_v2",
+        "title": "BWA-MEM",
+        "code": "bwamem",
+        "version": "2.0.0"
+    },
     {
         "uuid": "be085e03-0989-4b44-81af-37efef5aa086",
         "submission_centers": [
diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py
index a6574a810..398c12845 100644
--- a/src/encoded/tests/test_annotated_filename.py
+++ b/src/encoded/tests/test_annotated_filename.py
@@ -527,10 +527,10 @@ def test_get_sequencing_center_code(
 ANOTHER_SOFTWARE = {"code": ANOTHER_SOFTWARE_CODE, "version": ANOTHER_SOFTWARE_VERSION}
 REFERENCE_GENOME_CODE = "GRCh38"
 TARGET_GENOME_CODE = "HELA_DSA"
-GENE_ANNOTATION_CODE = "gencode45"
-
+GENE_ANNOTATION_CODE = "gencode"
+GENE_ANNOTATION_VERSION = "v45"
 SOME_REFERENCE_GENOME = {"code": REFERENCE_GENOME_CODE}
-SOME_GENE_ANNOTATION = [{"code": GENE_ANNOTATION_CODE}]
+SOME_GENE_ANNOTATION = [{"code": GENE_ANNOTATION_CODE, "version": GENE_ANNOTATION_VERSION}]
 SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]}
 SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]}
 RNA_ALIGNED_READS = {"data_type": ["Aligned Reads"], "data_category": ["RNA Quantification"]}
@@ -646,7 +646,7 @@ def test_get_sequencing_center_code(
             SOME_REFERENCE_GENOME,
             SOME_GENE_ANNOTATION,
             TSV_FILE_EXTENSION,
-            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_gene",
+            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}_gene",
             False
         ),
         (
@@ -655,7 +655,7 @@ def test_get_sequencing_center_code(
             SOME_REFERENCE_GENOME,
             SOME_GENE_ANNOTATION,
             TSV_FILE_EXTENSION,
-            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_isoform",
+            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}_isoform",
             False
         ),
         (
@@ -673,7 +673,7 @@ def test_get_sequencing_center_code(
             SOME_REFERENCE_GENOME,
             SOME_GENE_ANNOTATION,
             SOME_FILE_EXTENSION,
-            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}",
+            f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}",
             False
         ),
         (

From ea6e7d39f78c465d3dc11f9562b1c2d326e69322 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Fri, 6 Dec 2024 14:55:34 -0500
Subject: [PATCH 15/17] Add title to reference file

---
 src/encoded/schemas/reference_file.json | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json
index a2065a894..4c652cae3 100644
--- a/src/encoded/schemas/reference_file.json
+++ b/src/encoded/schemas/reference_file.json
@@ -52,6 +52,9 @@
         {
             "$ref": "mixins.json#/tags"
         },
+        {
+            "$ref": "mixins.json#/title"
+        },
         {
             "$ref": "mixins.json#/url"
         },

From 9c20c23038c702e4b338f57aab5f6bc7c101e704 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Thu, 12 Dec 2024 10:48:04 -0500
Subject: [PATCH 16/17] embed reference file title, code, and version

---
 src/encoded/schemas/mixins.json  |  2 +-
 src/encoded/types/output_file.py | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/encoded/schemas/mixins.json b/src/encoded/schemas/mixins.json
index 9574b1af9..ad9e48741 100644
--- a/src/encoded/schemas/mixins.json
+++ b/src/encoded/schemas/mixins.json
@@ -77,7 +77,7 @@
     "annotation": {
         "annotation": {
             "title": "Gene Annotation",
-            "description": "Gene annotation used for gene or transcript quantification",
+            "description": "Gene annotation used for analysis",
             "type": "array",
             "items": {
                 "type": "string",
diff --git a/src/encoded/types/output_file.py b/src/encoded/types/output_file.py
index 601e603ce..5fdd38087 100644
--- a/src/encoded/types/output_file.py
+++ b/src/encoded/types/output_file.py
@@ -4,6 +4,14 @@
 from .file import File
 
 
+def _build_output_file_embedded_list():
+    """Embeds for output files: File embeds plus annotation fields."""
+    return File.embedded_list + [
+        "annotation.code",
+        "annotation.version",
+        "annotation.title",
+    ]
+
 @collection(
     name="output-files",
     acl=ONLY_ADMIN_VIEW_ACL,
@@ -15,7 +23,7 @@
 class OutputFile(File):
     item_type = "output_file"
     schema = load_schema("encoded:schemas/output_file.json")
-    embedded_list = File.embedded_list
+    embedded_list = _build_output_file_embedded_list()
 
     # processed files don't want md5 as unique key
     def unique_keys(self, properties):

From 5c0db5116ea78287475d78fe67be29c82f8ecda9 Mon Sep 17 00:00:00 2001
From: sarahgonicholson <sarah_nicholson@hms.harvard.edu>
Date: Thu, 12 Dec 2024 12:51:22 -0500
Subject: [PATCH 17/17] Add title to reference file insert

---
 src/encoded/tests/data/workbook-inserts/reference_file.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/encoded/tests/data/workbook-inserts/reference_file.json b/src/encoded/tests/data/workbook-inserts/reference_file.json
index ba0330282..070e4a4b0 100644
--- a/src/encoded/tests/data/workbook-inserts/reference_file.json
+++ b/src/encoded/tests/data/workbook-inserts/reference_file.json
@@ -28,6 +28,7 @@
             "smaht:ReferenceFile-collapsed-genes-gencode_v45"
         ],
         "code": "gencode",
+        "title": "GENCODEv45",
         "version": "v45",
         "data_type": [
             "Gene Model"