SN Rnaseq filenames (#303)

* Start with rna-seq annotated filename functions * Add gene_annotation and update rnaseq filenames * Fix tests * commit new files * update consortia requirement * Fix annotated filename insert * remove files from gene annotation * Merge up-to-date with main * Add error message for no transcript info * Make annotation a reference file link * fix merge * take annotation off supplementary file * Update release_file and make code non-unique * Make code separate from mixins * Add title to reference file * embed reference file title, code, and version * Add title to reference file insert
smaht-dac · Dec 12, 2024 · 844748e · 844748e
1 parent 411e838
commit 844748e
Show file tree

Hide file tree

Showing 18 changed files with 188 additions and 34 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,14 @@ smaht-portal
 Change Log
 ----------
 
+0.118.0
+=======
+`PR 303: SN Rnaseq filenames <https://github.com/smaht-dac/smaht-portal/pull/303>`_
+
+* Make `annotation` link in OutputFile an array of links to ReferenceFile
+* Add `code` property to ReferenceFile
+
+
 0.117.1
 =======
 `PR 284: Bm nomenclature page3 <https://github.com/smaht-dac/smaht-portal/pull/284>`_

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "encoded"
-version = "0.117.1"
+version = "0.118.0"
 description = "SMaHT Data Analysis Portal"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"

diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py
@@ -87,7 +87,7 @@ class AssociatedItems:
     sequencing_center: Dict[str, Any]
     software: List[Dict[str, Any]]
     reference_genome: Dict[str, Any]
-    gene_annotation: Dict[str, Any]
+    gene_annotations: Dict[str, Any]
     file_sets: List[Dict[str, Any]]
     donor_specific_assembly: Dict[str, Any]
     assays: List[Dict[str, Any]]
@@ -114,7 +114,7 @@ def get_associated_items(
     file_format = get_file_format(file, request_handler)
     software = get_software(file, request_handler)
     reference_genome = get_reference_genome(file, request_handler)
-    gene_annotation = get_gene_annotation(file, request_handler)
+    gene_annotations = get_gene_annotations(file, request_handler)
     donor_specific_assembly = get_donor_specific_assembly(file, request_handler)
     if donor_specific_assembly:
         file_sets=get_derived_from_file_sets(file, request_handler)
@@ -135,7 +135,7 @@ def get_associated_items(
         file_format=file_format,
         software=software,
         reference_genome=reference_genome,
-        gene_annotation=gene_annotation,
+        gene_annotations=gene_annotations,
         file_sets=file_sets,
         donor_specific_assembly=donor_specific_assembly,
         assays=assays,
@@ -215,11 +215,11 @@ def get_reference_genome(
     return get_item(file_utils.get_reference_genome(file), request_handler)
 
 
-def get_gene_annotation(
+def get_gene_annotations(
     file: Dict[str, Any], request_handler: RequestHandler
-) -> Dict[str, Any]:
-    """Get gene annotation for file."""
-    return get_item(file_utils.get_gene_annotation(file), request_handler)
+) -> List[Dict[str, Any]]:
+    """Get the gene annotation items linked from the file's `annotation`."""
+    return get_items(file_utils.get_annotation(file), request_handler)
 
 
 def get_software(
@@ -443,7 +443,7 @@ def get_annotated_filename(
         file,
         associated_items.software,
         associated_items.reference_genome,
-        associated_items.gene_annotation,
+        associated_items.gene_annotations,
         associated_items.file_format
     )
     errors = collect_errors(
@@ -817,7 +817,7 @@ def get_analysis(
     file: Dict[str, Any],
     software: List[Dict[str, Any]],
     reference_genome: Dict[str, Any],
-    gene_annotation: Dict[str, Any],
+    gene_annotations: Dict[str, Any],
     file_extension: Dict[str, Any],
 ) -> FilenamePart:
     """Get analysis info for file.
@@ -827,7 +827,7 @@ def get_analysis(
     """
     software_and_versions = get_software_and_versions(software)
     reference_genome_code = item_utils.get_code(reference_genome)
-    gene_annotation_code = item_utils.get_code(gene_annotation)
+    gene_annotation_code = get_annotations_and_versions(gene_annotations)
     transcript_info_code = get_rna_seq_tsv_value(file, file_extension)
     value = get_analysis_value(
         software_and_versions,
@@ -894,6 +894,60 @@ def get_analysis_value(
     return ANALYSIS_INFO_SEPARATOR.join(to_write)
 
 
+def get_annotations_and_versions(gene_annotations: List[Dict[str, Any]]) -> str:
+    """Build the annotation code/version portion of the analysis info.
+
+    Items without a code are ignored; if any coded item lacks a version,
+    a warning is logged and an empty string is returned.
+    """
+    coded = get_annotations_with_codes(gene_annotations)
+    if not coded:
+        return ""
+    versioned = get_annotations_with_versions(coded)
+    if len(versioned) != len(coded):
+        missing_versions = get_annotation_codes_missing_versions(coded)
+        logger.warning(f"Missing versions for annotation items: {missing_versions}.")
+        return ""
+    return get_annotations_and_versions_string(versioned)
+
+
+def get_annotations_with_codes(
+    annotation_items: List[Dict[str, Any]]
+) -> List[Dict[str, Any]]:
+    """Keep only the annotation items for which a code is set."""
+    return list(filter(item_utils.get_code, annotation_items))
+
+
+def get_annotations_with_versions(
+    annotation_items: List[Dict[str, Any]]
+) -> List[Dict[str, Any]]:
+    """Keep only the annotation items for which a version is set."""
+    return list(filter(item_utils.get_version, annotation_items))
+
+
+def get_annotations_and_versions_string(annotation_items: List[Dict[str, Any]]) -> str:
+    """Join each item's code and version, ordered by code, into one string."""
+    # Sort by code so the resulting string does not depend on input order.
+    # ANALYSIS_INFO_SEPARATOR joins both code to version and pair to pair.
+    code_version_pairs = []
+    for annotation in sorted(annotation_items, key=item_utils.get_code):
+        code = item_utils.get_code(annotation)
+        version = item_utils.get_version(annotation)
+        code_version_pairs.append(f"{code}{ANALYSIS_INFO_SEPARATOR}{version}")
+    return ANALYSIS_INFO_SEPARATOR.join(code_version_pairs)
+
+
+def get_annotation_codes_missing_versions(
+    annotation_items: List[Dict[str, Any]]
+) -> List[str]:
+    """Collect the codes of annotation items that have no version."""
+    codes_without_version = []
+    for annotation in annotation_items:
+        if not item_utils.get_version(annotation):
+            codes_without_version.append(item_utils.get_code(annotation))
+    return codes_without_version
+
+
 def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
     """Get software and accompanying versions for file.
 

diff --git a/src/encoded/commands/release_file.py b/src/encoded/commands/release_file.py
@@ -454,6 +454,9 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_OPEN
                 ),
+                file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
+                    file_constants.ACCESS_STATUS_OPEN
+                )
             },
             IPSC: {
                 file_constants.DATA_CATEGORY_SEQUENCING_READS: (
@@ -465,6 +468,9 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_PROTECTED
                 ),
+                file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
+                    file_constants.ACCESS_STATUS_OPEN
+                )
             },
             self.TISSUE: {
                 file_constants.DATA_CATEGORY_SEQUENCING_READS: (
@@ -476,12 +482,19 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_OPEN
                 ),
+                file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
+                    file_constants.ACCESS_STATUS_OPEN
+                )
             },
         }
         if dataset in [
             "colo829bl",
             "colo829t",
             "colo829blt_50to1",
+            "colo829blt_in_silico",
+            "colo829_snv_indel_challenge_data",
+            "hapmap_snv_indel_challenge_data",
+            "mei_detection_challenge_data",
             "hapmap",
             "hg002",
             "hg00438",
@@ -498,6 +511,7 @@ def get_access_status(self, dataset: str) -> str:
             "lb_ipsc_4",
             "lb_ipsc_52",
             "lb_ipsc_60",
+            "ipsc_snv_indel_challenge_data",
         ]:
             dataset_category = IPSC
         elif dataset == self.TISSUE:

diff --git a/src/encoded/item_utils/constants/file.py b/src/encoded/item_utils/constants/file.py
@@ -6,6 +6,7 @@
 DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls"
 DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads"
 DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls"
+DATA_CATEGORY_RNA_QUANTIFICATION = "RNA Quantification"
 DATASET = "dataset"
 EXTRA_FILES = "extra_files"
 FILE_SETS = "file_sets"

diff --git a/src/encoded/item_utils/file.py b/src/encoded/item_utils/file.py
@@ -76,9 +76,9 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any
     return properties.get("reference_genome", "")
 
 
-def get_gene_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]:
-    """Get gene annotation from properties."""
-    return properties.get("gene_annotation", "")
+def get_annotation(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]:
+    """Get annotation reference file links (empty list if unset) from properties."""
+    return properties.get("annotation", [])
 
 
 def get_file_sets(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]:

diff --git a/src/encoded/schemas/mixins.json b/src/encoded/schemas/mixins.json
@@ -74,6 +74,17 @@
             }
         }
     },
+    "annotation": {
+        "annotation": {
+            "title": "Gene Annotation",
+            "description": "Gene annotation used for analysis",
+            "type": "array",
+            "items": {
+                "type": "string",
+                "linkTo": "ReferenceFile"
+            }
+        }
+    },
     "attachment": {
         "attachment": {
             "title": "Attached File",

diff --git a/src/encoded/schemas/output_file.json b/src/encoded/schemas/output_file.json
@@ -50,7 +50,7 @@
             "$ref": "mixins.json#/file_release"
         },
         {
-            "$ref": "mixins.json#/gene_annotation"
+            "$ref": "mixins.json#/annotation"
         },
         {
             "$ref": "mixins.json#/modified"

diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json
@@ -52,6 +52,9 @@
         {
             "$ref": "mixins.json#/tags"
         },
+        {
+            "$ref": "mixins.json#/title"
+        },
         {
             "$ref": "mixins.json#/url"
         },
@@ -75,6 +78,17 @@
     "properties": {
         "schema_version": {
             "default": "2"
+        },
+        "code": {
+            "title": "Code",
+            "description": "Code used in file naming scheme",
+            "type": "string",
+            "permission": "restricted_fields",
+            "pattern": "^[A-Za-z0-9_]{2,}$"
+        },
+        "version": {
+            "description": "The version of the reference file",
+            "pattern": "[A-Za-z0-9._-]+"
         }
     }
 }
diff --git a/src/encoded/schemas/software.json b/src/encoded/schemas/software.json
@@ -59,9 +59,6 @@
         {
             "$ref": "mixins.json#/category"
         },
-        {
-            "$ref": "mixins.json#/code"
-        },
         {
             "$ref": "mixins.json#/description"
         },
@@ -125,6 +122,10 @@
             }
         },
         "code": {
+            "title": "Code",
+            "description": "Code used in file naming scheme",
+            "type": "string",
+            "permission": "restricted_fields",
             "pattern": "^[A-Za-z0-9_]{2,}$"
         },
         "submitted_id": {

diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json
@@ -35,9 +35,6 @@
         {
             "$ref": "mixins.json#/file_release"
         },
-        {
-            "$ref": "mixins.json#/gene_annotation"
-        },
         {
             "$ref": "mixins.json#/modified"
         },

diff --git a/src/encoded/tests/data/workbook-inserts/file_format.json b/src/encoded/tests/data/workbook-inserts/file_format.json
@@ -80,5 +80,16 @@
             "ReferenceFile",
             "SupplementaryFile"
         ]
+    },
+    {
+        "uuid": "c3e54d5f-647c-4ca2-9b21-5c01caa3f691",
+        "submission_centers": [
+            "smaht"
+        ],
+        "identifier": "GTF",
+        "standard_file_extension": "gtf",
+        "valid_item_types": [
+            "ReferenceFile"
+        ]
     }
 ]
diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json
@@ -70,9 +70,11 @@
             "foo:software_vep"
         ],
         "reference_genome": "GRCh38",
-        "gene_annotation": "gencode45",
+        "annotation": [
+            "smaht:ReferenceFile-collapsed-genes-gencode_v45"
+        ],
         "status": "released",
         "dataset": "colo829t",
-        "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam"
+        "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode_v45.aligned.sorted.phased.bam"
     }
 ]
diff --git a/src/encoded/tests/data/workbook-inserts/reference_file.json b/src/encoded/tests/data/workbook-inserts/reference_file.json
@@ -21,5 +21,24 @@
         ],
         "file_size": 8000,
         "status": "restricted"
+    },
+    {
+        "uuid": "8fc6f554-59c9-490a-b6dc-86665a0b971d",
+        "aliases": [
+            "smaht:ReferenceFile-collapsed-genes-gencode_v45"
+        ],
+        "code": "gencode",
+        "title": "GENCODEv45",
+        "version": "v45",
+        "data_type": [
+            "Gene Model"
+        ],
+        "data_category": [
+            "Genome Annotation"
+        ],
+        "file_format": "GTF",
+        "consortia": [
+            "smaht"
+        ]
     }
 ]
diff --git a/src/encoded/tests/data/workbook-inserts/software.json b/src/encoded/tests/data/workbook-inserts/software.json
@@ -27,6 +27,20 @@
         "code": "strelka",
         "version": "3.1.1"
     },
+    {
+        "uuid": "0e6ee3a4-2831-4ee4-b648-f53808282f38",
+        "submission_centers": [
+            "smaht"
+        ],
+        "submitted_id": "TEST_SOFTWARE_BWA-MEM_2.0.0",
+        "category": [
+            "Alignment"
+        ],
+        "name": "bwa_mem_v2",
+        "title": "BWA-MEM",
+        "code": "bwamem",
+        "version": "2.0.0"
+    },
     {
         "uuid": "be085e03-0989-4b44-81af-37efef5aa086",
         "submission_centers": [
-Original file line number
+Diff line change
@@ Expand Up / @@ -35,9 +35,6 @@ @@
             {
                 "$ref": "mixins.json#/file_release"
             },
-            {
-                "$ref": "mixins.json#/gene_annotation"
-            },
             {
                 "$ref": "mixins.json#/modified"
             },
@@ Expand Down @@