From 8f9856e585616a460b5f31160f867f70c4a80c69 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Thu, 21 Nov 2024 16:29:38 -0500 Subject: [PATCH 01/17] Start with rna-seq annotated filename functions --- src/encoded/commands/create_annotated_filenames.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py index 3bda783d7..675a2d83a 100644 --- a/src/encoded/commands/create_annotated_filenames.py +++ b/src/encoded/commands/create_annotated_filenames.py @@ -914,6 +914,11 @@ def get_chain_file_value(file: Dict[str, Any]) -> str: return CHAIN_FILE_INFO_SEPARATOR.join([source_assembly,target_assembly]) +def get_rna_seq_tsv_value(file: Dict[str, Any]) -> str: + """Get isoform or gene from description and gencode version for RNA-seq tsv and bam files.""" + # Use description and file format to determine with value + + def get_file_extension( file: Dict[str, Any], file_format: Dict[str, Any] ) -> FilenamePart: From 7fa4fab7165310e80703b6cc0636f4960383ced0 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Tue, 26 Nov 2024 13:55:02 -0500 Subject: [PATCH 02/17] Add gene_annotation and update rnaseq filenames --- .../commands/create_annotated_filenames.py | 41 ++++++++++-- src/encoded/item_utils/file.py | 12 +++- src/encoded/item_utils/file_format.py | 6 +- src/encoded/project/loadxl.py | 1 + src/encoded/schemas/mixins.json | 8 +++ src/encoded/schemas/output_file.json | 3 + src/encoded/schemas/supplementary_file.json | 3 + src/encoded/tests/test_annotated_filename.py | 65 ++++++++++++++++--- 8 files changed, 122 insertions(+), 17 deletions(-) diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py index 675a2d83a..60fa5ff03 100644 --- a/src/encoded/commands/create_annotated_filenames.py +++ b/src/encoded/commands/create_annotated_filenames.py @@ -84,6 +84,7 @@ class AssociatedItems: sequencing_center: Dict[str, Any] software: List[Dict[str, Any]] reference_genome: Dict[str, Any] + gene_annotation: Dict[str, Any] file_sets: List[Dict[str, Any]] donor_specific_assembly: Dict[str, Any] assays: List[Dict[str, Any]] @@ -110,6 +111,7 @@ def get_associated_items( file_format = get_file_format(file, request_handler) software = get_software(file, request_handler) reference_genome = get_reference_genome(file, request_handler) + gene_annotation = get_gene_annotation(file, request_handler) donor_specific_assembly = get_donor_specific_assembly(file, request_handler) if donor_specific_assembly: file_sets=get_derived_from_file_sets(file, request_handler) @@ -130,6 +132,7 @@ def get_associated_items( file_format=file_format, software=software, reference_genome=reference_genome, + gene_annotation=gene_annotation, file_sets=file_sets, donor_specific_assembly=donor_specific_assembly, assays=assays, @@ -209,6 +212,13 @@ def get_reference_genome( return get_item(file_utils.get_reference_genome(file), request_handler) +def get_gene_annotation( + file: Dict[str, Any], request_handler: RequestHandler +) -> Dict[str, Any]: + """Get gene annotation for file.""" + return get_item(file_utils.get_gene_annotation(file), request_handler) + + def get_software( file: Dict[str, Any], request_handler: RequestHandler ) -> List[Dict[str, Any]]: @@ -427,7 +437,11 @@ def get_annotated_filename( accession = get_accession(file) file_extension = get_file_extension(file, associated_items.file_format) analysis_info = get_analysis( - file, associated_items.software, associated_items.reference_genome,associated_items.file_format + file, + associated_items.software, + associated_items.reference_genome, + associated_items.gene_annotation, + associated_items.file_format ) errors = collect_errors( project_id, @@ -800,6 +814,7 @@ def get_analysis( file: Dict[str, Any], software: List[Dict[str, Any]], reference_genome: Dict[str, Any], + gene_annotation: Dict[str, Any], file_extension: Dict[str, Any], ) -> FilenamePart: """Get analysis info for file. @@ -809,14 +824,19 @@ def get_analysis( """ software_and_versions = get_software_and_versions(software) reference_genome_code = item_utils.get_code(reference_genome) + gene_annotation_code = item_utils.get_code(gene_annotation) errors = get_analysis_errors(file, reference_genome_code) if errors: return get_filename_part(errors=errors) value = get_analysis_value( - software_and_versions, reference_genome_code + software_and_versions, + reference_genome_code, + gene_annotation_code ) if file_format_utils.is_chain_file(file_extension): value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}" + elif file_format_utils.is_tsv_file(file_extension) and "RNA Quantification" in file_utils.get_data_category(file): + value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_rna_seq_tsv_value(file)}" if not value: if file_utils.is_unaligned_reads(file): # Think this is the only case (?) return get_filename_part(value=DEFAULT_ABSENT_FIELD) @@ -842,12 +862,14 @@ def get_analysis_errors( def get_analysis_value( - software_and_versions: str, reference_genome_code: str + software_and_versions: str, + reference_genome_code: str, + gene_annotation_code: str ) -> str: """Get analysis value for filename.""" to_write = [ string - for string in [software_and_versions, reference_genome_code] + for string in [software_and_versions, reference_genome_code, gene_annotation_code] if string ] return ANALYSIS_INFO_SEPARATOR.join(to_write) @@ -915,9 +937,14 @@ def get_chain_file_value(file: Dict[str, Any]) -> str: def get_rna_seq_tsv_value(file: Dict[str, Any]) -> str: - """Get isoform or gene from description and gencode version for RNA-seq tsv and bam files.""" - # Use description and file format to determine with value - + """Get isoform or gene from data type RNA-seq tsv files.""" + if "Gene Expression" in file_utils.get_data_type(file): + return "gene" + elif "Transcript Expression" in file_utils.get_data_type(file): + return "isoform" + else: + return "" + def get_file_extension( file: Dict[str, Any], file_format: Dict[str, Any] diff --git a/src/encoded/item_utils/file.py b/src/encoded/item_utils/file.py index 595bae82f..22e29144c 100644 --- a/src/encoded/item_utils/file.py +++ b/src/encoded/item_utils/file.py @@ -76,6 +76,11 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any return properties.get("reference_genome", "") +def get_gene_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]: + """Get gene annotation from properties.""" + return properties.get("gene_annotation", "") + + def get_file_sets(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]: """Get file sets from properties.""" return properties.get("file_sets", []) @@ -425,4 +430,9 @@ def get_associated_files_status( def get_override_group_coverage(file: Dict[str, Any]) -> str: """Get override group coverage from properties.""" - return file.get("override_group_coverage","") \ No newline at end of file + return file.get("override_group_coverage","") + + +def is_rsem_tsv(properties: Dict[str, Any], request_handler: RequestHandler) -> bool: + """Check if file is an RSEM tsv output file.""" + return get_file_extension(request_handler,properties) == "tsv" and "RNA Quantification" in get_data_category(properties) \ No newline at end of file diff --git a/src/encoded/item_utils/file_format.py b/src/encoded/item_utils/file_format.py index bb1679e1e..8583c6180 100644 --- a/src/encoded/item_utils/file_format.py +++ b/src/encoded/item_utils/file_format.py @@ -10,4 +10,8 @@ def get_other_allowed_extensions(properties: Dict[str, Any]) -> str: def is_chain_file(properties: Dict[str, Any]) -> bool: - return get_standard_file_extension(properties) == "chain.gz" \ No newline at end of file + return get_standard_file_extension(properties) in ["chain.gz","chain"] + + +def is_tsv_file(properties: Dict[str, Any]) -> bool: + return get_standard_file_extension(properties) == "tsv" diff --git a/src/encoded/project/loadxl.py b/src/encoded/project/loadxl.py index 8274c7b9b..474bbf752 100644 --- a/src/encoded/project/loadxl.py +++ b/src/encoded/project/loadxl.py @@ -11,6 +11,7 @@ class SMaHTProjectLoadxl(SnovaultProjectLoadxl): "file_format", "quality_metric", "reference_genome", + "gene_annotation", "software", "tracking_item", "image", diff --git a/src/encoded/schemas/mixins.json b/src/encoded/schemas/mixins.json index 140e44342..f28860b8a 100644 --- a/src/encoded/schemas/mixins.json +++ b/src/encoded/schemas/mixins.json @@ -364,6 +364,14 @@ "minimum": 1 } }, + "gene_annotation": { + "gene_annotation": { + "title": "Gene Annotation", + "description": "Gene annotation used for gene or transcript quantification", + "type": "string", + "linkTo": "GeneAnnotation" + } + }, "identifier": { "identifier": { "title": "Identifier", diff --git a/src/encoded/schemas/output_file.json b/src/encoded/schemas/output_file.json index 7b422360e..23526f66b 100644 --- a/src/encoded/schemas/output_file.json +++ b/src/encoded/schemas/output_file.json @@ -49,6 +49,9 @@ { "$ref": "mixins.json#/file_release" }, + { + "$ref": "mixins.json#/gene_annotation" + }, { "$ref": "mixins.json#/modified" }, diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json index a8bce5edf..4528972c8 100644 --- a/src/encoded/schemas/supplementary_file.json +++ b/src/encoded/schemas/supplementary_file.json @@ -35,6 +35,9 @@ { "$ref": "mixins.json#/file_release" }, + { + "$ref": "mixins.json#/gene_annotation" + }, { "$ref": "mixins.json#/modified" }, diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py index 5b3176134..ccaca2878 100644 --- a/src/encoded/tests/test_annotated_filename.py +++ b/src/encoded/tests/test_annotated_filename.py @@ -527,8 +527,10 @@ def test_get_sequencing_center_code( ANOTHER_SOFTWARE = {"code": ANOTHER_SOFTWARE_CODE, "version": ANOTHER_SOFTWARE_VERSION} REFERENCE_GENOME_CODE = "GRCh38" TARGET_GENOME_CODE = "HELA_DSA" +GENE_ANNOTATION_CODE = "gencode45" SOME_REFERENCE_GENOME = {"code": REFERENCE_GENOME_CODE} +SOME_GENE_ANNOTATION = {"code": GENE_ANNOTATION_CODE} SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]} SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]} SOME_CHAIN_FILE = { @@ -536,6 +538,14 @@ def test_get_sequencing_center_code( "source_assembly": REFERENCE_GENOME_CODE, "target_assembly": TARGET_GENOME_CODE } +SOME_TSV_FILE = { + "data_type": ["Gene Expression"], + "data_category": ["RNA Quantification"] +} +SOME_ISOFORM_TSV_FILE = { + "data_type": ["Transcript Expression"], + "data_category": ["RNA Quantification"] +} SOME_SOMATIC_VARIANT_CALLS = {"data_category": ["Somatic Variant Calls"]} SOME_VARIANT_CALLS = { "data_category": ["Somatic Variant Calls"], @@ -556,37 +566,45 @@ def test_get_sequencing_center_code( "standard_file_extension": "chain.gz", "valid_item_types": ["SupplementaryFile"] } +TSV_FILE_EXTENSION = { + "identifier": "TSV", + "standard_file_extension": "tsv", + "valid_item_types": ["SupplementaryFile", "OutputFile"] +} @pytest.mark.parametrize( - "file,software,reference_genome,file_extension,expected,errors", + "file,software,reference_genome,gene_annotation,file_extension,expected,errors", [ - ({}, [], {}, {},"" , True), - (SOME_UNALIGNED_READS, [], {}, SOME_FILE_EXTENSION,DEFAULT_ABSENT_FIELD, False), + ({}, [], {}, {}, {},"" , True), + (SOME_UNALIGNED_READS, [], {}, {}, SOME_FILE_EXTENSION,DEFAULT_ABSENT_FIELD, False), ( SOME_UNALIGNED_READS, [SOME_SOFTWARE], {}, + {}, SOME_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}", False, ), - (SOME_UNALIGNED_READS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, SOME_FILE_EXTENSION, "", True), - (SOME_ALIGNED_READS, [], {}, {},"", True), - (SOME_ALIGNED_READS, [SOME_SOFTWARE], {}, SOME_FILE_EXTENSION, "", True), + (SOME_UNALIGNED_READS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, {}, SOME_FILE_EXTENSION, "", True), + (SOME_ALIGNED_READS, [], {}, {}, {},"", True), + (SOME_ALIGNED_READS, [SOME_SOFTWARE], {}, {}, SOME_FILE_EXTENSION, "", True), ( SOME_ALIGNED_READS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, + {}, SOME_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False, ), - (SOME_SOMATIC_VARIANT_CALLS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, VCF_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False), + (SOME_SOMATIC_VARIANT_CALLS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, {}, VCF_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False), ( SOME_VARIANT_CALLS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, + {}, VCF_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False, @@ -595,6 +613,7 @@ def test_get_sequencing_center_code( SOME_ALIGNED_READS, [SOME_SOFTWARE, ANOTHER_SOFTWARE], SOME_REFERENCE_GENOME, + {}, SOME_FILE_EXTENSION, f"{ANOTHER_SOFTWARE_CODE}_{ANOTHER_SOFTWARE_VERSION}_{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False, @@ -603,6 +622,7 @@ def test_get_sequencing_center_code( SOME_ALIGNED_READS, [SOME_SOFTWARE, SOME_ITEM], SOME_REFERENCE_GENOME, + {}, SOME_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False, @@ -611,22 +631,51 @@ def test_get_sequencing_center_code( SOME_CHAIN_FILE, [SOME_SOFTWARE, SOME_ITEM], {}, + {}, CHAIN_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}To{TARGET_GENOME_CODE}", False, ), + ( + SOME_TSV_FILE, + [SOME_SOFTWARE], + SOME_REFERENCE_GENOME, + SOME_GENE_ANNOTATION, + TSV_FILE_EXTENSION, + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_gene", + False + ), + ( + SOME_ISOFORM_TSV_FILE, + [SOME_SOFTWARE], + SOME_REFERENCE_GENOME, + SOME_GENE_ANNOTATION, + TSV_FILE_EXTENSION, + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_isoform", + False + ), + ( + SOME_ALIGNED_READS, + [SOME_SOFTWARE], + SOME_REFERENCE_GENOME, + SOME_GENE_ANNOTATION, + SOME_FILE_EXTENSION, + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}", + False + ) ], ) def test_get_analysis( file: Dict[str, Any], software: List[Dict[str, Any]], reference_genome: Dict[str, Any], + gene_annotation: Dict[str, Any], file_extension: Dict[str, Any], expected: str, errors: bool, ) -> None: """Test analysis info retrieval for annotated filenames.""" - result = get_analysis(file, software, reference_genome, file_extension) + result = get_analysis(file, software, reference_genome, gene_annotation, file_extension) assert_filename_part_matches(result, expected, errors) From a8a2208f714df6527e316ad3e51ddb03eb8e3747 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Tue, 26 Nov 2024 15:16:29 -0500 Subject: [PATCH 03/17] Fix tests --- src/encoded/tests/data/workbook-inserts/output_file.json | 1 + 1 file changed, 1 insertion(+) diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json index cd93cf536..b04c8f60e 100644 --- a/src/encoded/tests/data/workbook-inserts/output_file.json +++ b/src/encoded/tests/data/workbook-inserts/output_file.json @@ -70,6 +70,7 @@ "foo:software_vep" ], "reference_genome": "GRCh38", + "gene_annotation": "gencode45", "status": "released", "dataset": "colo829t", "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38.aligned.sorted.phased.bam" From dec92ca5d07b09d742acfbe96e868fc4c20324d1 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Tue, 26 Nov 2024 15:22:05 -0500 Subject: [PATCH 04/17] commit new files --- src/encoded/schemas/gene_annotation.json | 113 ++++++++++++++++++ .../workbook-inserts/gene_annotation.json | 10 ++ src/encoded/types/gene_annotation.py | 21 ++++ 3 files changed, 144 insertions(+) create mode 100644 src/encoded/schemas/gene_annotation.json create mode 100644 src/encoded/tests/data/workbook-inserts/gene_annotation.json create mode 100644 src/encoded/types/gene_annotation.py diff --git a/src/encoded/schemas/gene_annotation.json b/src/encoded/schemas/gene_annotation.json new file mode 100644 index 000000000..c9b404078 --- /dev/null +++ b/src/encoded/schemas/gene_annotation.json @@ -0,0 +1,113 @@ +{ + "title": "Gene Annotation", + "$id": "/profiles/gene_annotation.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "required": [ + "identifier", + "title" + ], + "anyOf": [ + { + "allOf": [ + {"required": ["submission_centers"]}, + {"required": ["consortia"]} + ] + }, + { + "required": [ + "consortia" + ] + } + ], + "identifyingProperties": [ + "accession", + "aliases", + "identifier", + "uuid" + ], + "additionalProperties": false, + "mixinProperties": [ + { + "$ref": "mixins.json#/accession" + }, + { + "$ref": "mixins.json#/aliases" + }, + { + "$ref": "mixins.json#/attribution" + }, + { + "$ref": "mixins.json#/code" + }, + { + "$ref": "mixins.json#/description" + }, + { + "$ref": "mixins.json#/identifier" + }, + { + "$ref": "mixins.json#/modified" + }, + { + "$ref": "mixins.json#/schema_version" + }, + { + "$ref": "mixins.json#/status" + }, + { + "$ref": "mixins.json#/submitted" + }, + { + "$ref": "mixins.json#/tags" + }, + { + "$ref": "mixins.json#/title" + }, + { + "$ref": "mixins.json#/url" + }, + { + "$ref": "mixins.json#/uuid" + }, + { + "$ref": "mixins.json#/version" + } + ], + "properties": { + "accession": { + "accessionType": "GA" + }, + "code": { + "pattern": "^[A-Za-z0-9]{3,}$" + }, + "schema_version": { + "default": "1" + }, + "version": { + "pattern": "^[A-Za-z0-9_-.]{3,}$" + }, + "files": { + "title": "Files", + "description": "Files associated with the gene annotation", + "type": "array", + "minItems": 1, + "uniqueItems": true, + "items": { + "type": "string", + "linkTo": "File" + } + } + }, + "columns": { + "identifier": { + "title": "Identifier" + }, + "code": { + "title": "Code" + }, + "title": { + "title": "Title" + } + } +} \ No newline at end of file diff --git a/src/encoded/tests/data/workbook-inserts/gene_annotation.json b/src/encoded/tests/data/workbook-inserts/gene_annotation.json new file mode 100644 index 000000000..19bb20293 --- /dev/null +++ b/src/encoded/tests/data/workbook-inserts/gene_annotation.json @@ -0,0 +1,10 @@ +[ + { + "identifier": "gencode45", + "title": "GENCODE v45", + "code": "gencode45", + "consortia": [ + "smaht" + ] + } +] diff --git a/src/encoded/types/gene_annotation.py b/src/encoded/types/gene_annotation.py new file mode 100644 index 000000000..961f6e5b5 --- /dev/null +++ b/src/encoded/types/gene_annotation.py @@ -0,0 +1,21 @@ +from snovault import collection, load_schema + +from .base import Item + +def _build_gene_annotation_embedded_list(): + """Embeds for search on gene annotations.""" + return [] + + +@collection( + name="gene-annotations", + unique_key="gene_annotation:identifier", + properties={ + "title": "Gene Annotations", + "description": "Gene annotations for gene and transcript quantification", + }, +) +class GeneAnnotation(Item): + item_type = "gene_annotation" + schema = load_schema("encoded:schemas/gene_annotation.json") + embedded_list = _build_gene_annotation_embedded_list() From cb10cb300b06ae51f4f2542a124ca388522a0eb7 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Tue, 26 Nov 2024 15:37:30 -0500 Subject: [PATCH 05/17] update consortia requirement --- src/encoded/schemas/gene_annotation.json | 5 ++--- src/encoded/tests/data/workbook-inserts/gene_annotation.json | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/encoded/schemas/gene_annotation.json b/src/encoded/schemas/gene_annotation.json index c9b404078..b46715c6a 100644 --- a/src/encoded/schemas/gene_annotation.json +++ b/src/encoded/schemas/gene_annotation.json @@ -9,9 +9,8 @@ ], "anyOf": [ { - "allOf": [ - {"required": ["submission_centers"]}, - {"required": ["consortia"]} + "required": [ + "submission_centers" ] }, { diff --git a/src/encoded/tests/data/workbook-inserts/gene_annotation.json b/src/encoded/tests/data/workbook-inserts/gene_annotation.json index 19bb20293..79ddcacd3 100644 --- a/src/encoded/tests/data/workbook-inserts/gene_annotation.json +++ b/src/encoded/tests/data/workbook-inserts/gene_annotation.json @@ -2,7 +2,7 @@ { "identifier": "gencode45", "title": "GENCODE v45", - "code": "gencode45", + "code": "gencode45", "consortia": [ "smaht" ] From 5cf2fea00d1b1090e06b777fd2f265a400f21b69 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Tue, 26 Nov 2024 15:52:13 -0500 Subject: [PATCH 06/17] Fix annotated filename insert --- src/encoded/tests/data/workbook-inserts/output_file.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json index b04c8f60e..cfd6e1e59 100644 --- a/src/encoded/tests/data/workbook-inserts/output_file.json +++ b/src/encoded/tests/data/workbook-inserts/output_file.json @@ -73,6 +73,6 @@ "gene_annotation": "gencode45", "status": "released", "dataset": "colo829t", - "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38.aligned.sorted.phased.bam" + "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam" } ] From de202b15ea5bf65ffadf184ff374a4e2ddfce83a Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Mon, 2 Dec 2024 13:03:17 -0500 Subject: [PATCH 07/17] remove files from gene annotation --- src/encoded/schemas/gene_annotation.json | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/encoded/schemas/gene_annotation.json b/src/encoded/schemas/gene_annotation.json index b46715c6a..45d6b6d12 100644 --- a/src/encoded/schemas/gene_annotation.json +++ b/src/encoded/schemas/gene_annotation.json @@ -85,17 +85,6 @@ }, "version": { "pattern": "^[A-Za-z0-9_-.]{3,}$" - }, - "files": { - "title": "Files", - "description": "Files associated with the gene annotation", - "type": "array", - "minItems": 1, - "uniqueItems": true, - "items": { - "type": "string", - "linkTo": "File" - } } }, "columns": { From a47704d501d16f436e4aad893827bd153ac83941 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Tue, 3 Dec 2024 12:02:34 -0500 Subject: [PATCH 08/17] Merge up-to-date with main --- CHANGELOG.rst | 8 ++++++++ pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c5d81b907..0b8468651 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ smaht-portal Change Log ---------- +0.116.0 +======= +`PR 299 SN RNA-seq filenames ` + +* Create new item GeneAnnotation that OutputFile and SupplementaryFile link to with property `gene_annotation` +* Update `commands/create_annotated_filenames.py` to include gencode version and gene/isoform information for RSEM tsv output files and RNA-seq aligned bams + + 0.115.0 ======= `PR 296 SN Sequencing validation ` diff --git a/pyproject.toml b/pyproject.toml index 5b006392c..74b28f921 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "encoded" -version = "0.115.0" +version = "0.116.0" description = "SMaHT Data Analysis Portal" authors = ["4DN-DCIC Team "] license = "MIT" From 393b526896d960bb083ed4028489407debf8bf37 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Wed, 4 Dec 2024 14:19:27 -0500 Subject: [PATCH 09/17] Add error message for no transcript info --- .../commands/create_annotated_filenames.py | 50 +++++++++++++------ src/encoded/item_utils/file.py | 5 -- src/encoded/tests/test_annotated_filename.py | 24 ++++++++- 3 files changed, 58 insertions(+), 21 deletions(-) diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py index 60fa5ff03..f0eecc3e1 100644 --- a/src/encoded/commands/create_annotated_filenames.py +++ b/src/encoded/commands/create_annotated_filenames.py @@ -33,6 +33,9 @@ ANALYSIS_INFO_SEPARATOR = "_" CHAIN_FILE_INFO_SEPARATOR = "To" +RNA_DATA_CATEGORY = "RNA Quantification" +GENE_DATA_TYPE = "Gene Expression" +ISOFORM_DATA_TYPE = "Transcript Expression" DEFAULT_PROJECT_ID = constants.PRODUCTION_PREFIX DEFAULT_ABSENT_FIELD = "X" @@ -825,18 +828,24 @@ def get_analysis( software_and_versions = get_software_and_versions(software) reference_genome_code = item_utils.get_code(reference_genome) gene_annotation_code = item_utils.get_code(gene_annotation) - errors = get_analysis_errors(file, reference_genome_code) - if errors: - return get_filename_part(errors=errors) + transcript_info_code = get_rna_seq_tsv_value(file, file_extension) value = get_analysis_value( software_and_versions, reference_genome_code, - gene_annotation_code + gene_annotation_code, + transcript_info_code ) if file_format_utils.is_chain_file(file_extension): value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}" - elif file_format_utils.is_tsv_file(file_extension) and "RNA Quantification" in file_utils.get_data_category(file): - value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_rna_seq_tsv_value(file)}" + errors = get_analysis_errors( + file, + reference_genome_code, + gene_annotation_code, + transcript_info_code, + file_extension, + ) + if errors: + return get_filename_part(errors=errors) if not value: if file_utils.is_unaligned_reads(file): # Think this is the only case (?) return get_filename_part(value=DEFAULT_ABSENT_FIELD) @@ -845,7 +854,11 @@ def get_analysis( def get_analysis_errors( - file: Dict[str, Any], reference_genome_code: str + file: Dict[str, Any], + reference_genome_code: str, + gene_annotation_code: str, + transcript_info_code: str, + file_extension: Dict[str, Any] ) -> List[str]: """Get analysis errors for file by file type.""" errors = [] @@ -858,18 +871,24 @@ def get_analysis_errors( if file_utils.is_variant_calls(file): if not reference_genome_code: errors.append("No reference genome code found") + if RNA_DATA_CATEGORY in file_utils.get_data_category(file): + if not gene_annotation_code: + errors.append("No gene annotation code found") + elif file_format_utils.is_tsv_file(file_extension) and not transcript_info_code: + errors.append("No gene or isoform code found") return errors def get_analysis_value( software_and_versions: str, reference_genome_code: str, - gene_annotation_code: str + gene_annotation_code: str, + transcript_info_code: str ) -> str: """Get analysis value for filename.""" to_write = [ string - for string in [software_and_versions, reference_genome_code, gene_annotation_code] + for string in [software_and_versions, reference_genome_code, gene_annotation_code, transcript_info_code] if string ] return ANALYSIS_INFO_SEPARATOR.join(to_write) @@ -936,12 +955,13 @@ def get_chain_file_value(file: Dict[str, Any]) -> str: return CHAIN_FILE_INFO_SEPARATOR.join([source_assembly,target_assembly]) -def get_rna_seq_tsv_value(file: Dict[str, Any]) -> str: - """Get isoform or gene from data type RNA-seq tsv files.""" - if "Gene Expression" in file_utils.get_data_type(file): - return "gene" - elif "Transcript Expression" in file_utils.get_data_type(file): - return "isoform" +def get_rna_seq_tsv_value(file: Dict[str, Any], file_extension: Dict[str, Any]) -> str: + """Get isoform or gene from data type for RNA-seq tsv files.""" + if file_format_utils.is_tsv_file(file_extension) and RNA_DATA_CATEGORY in file_utils.get_data_category(file): + if GENE_DATA_TYPE in file_utils.get_data_type(file): + return "gene" + elif ISOFORM_DATA_TYPE in file_utils.get_data_type(file): + return "isoform" else: return "" diff --git a/src/encoded/item_utils/file.py b/src/encoded/item_utils/file.py index 22e29144c..e40d3cc22 100644 --- a/src/encoded/item_utils/file.py +++ b/src/encoded/item_utils/file.py @@ -431,8 +431,3 @@ def get_associated_files_status( def get_override_group_coverage(file: Dict[str, Any]) -> str: """Get override group coverage from properties.""" return file.get("override_group_coverage","") - - -def is_rsem_tsv(properties: Dict[str, Any], request_handler: RequestHandler) -> bool: - """Check if file is an RSEM tsv output file.""" - return get_file_extension(request_handler,properties) == "tsv" and "RNA Quantification" in get_data_category(properties) \ No newline at end of file diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py index ccaca2878..c2f7b2d63 100644 --- a/src/encoded/tests/test_annotated_filename.py +++ b/src/encoded/tests/test_annotated_filename.py @@ -533,6 +533,7 @@ def test_get_sequencing_center_code( SOME_GENE_ANNOTATION = {"code": GENE_ANNOTATION_CODE} SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]} SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]} +RNA_ALIGNED_READS = {"data_type": ["Aligned Reads"], "data_category": ["RNA Quantification"]} SOME_CHAIN_FILE = { "data_type": ["SupplementaryFile"], "source_assembly": REFERENCE_GENOME_CODE, @@ -542,6 +543,9 @@ def test_get_sequencing_center_code( "data_type": ["Gene Expression"], "data_category": ["RNA Quantification"] } +SOME_OTHER_FILE = { + "data_category": ["RNA Quantification"] +} SOME_ISOFORM_TSV_FILE = { "data_type": ["Transcript Expression"], "data_category": ["RNA Quantification"] @@ -645,7 +649,7 @@ def test_get_sequencing_center_code( f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_gene", False ), - ( + ( SOME_ISOFORM_TSV_FILE, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, @@ -654,6 +658,15 @@ def test_get_sequencing_center_code( f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_isoform", False ), + ( + SOME_OTHER_FILE, + [SOME_SOFTWARE], + SOME_REFERENCE_GENOME, + SOME_GENE_ANNOTATION, + TSV_FILE_EXTENSION, + "", + True + ), ( SOME_ALIGNED_READS, [SOME_SOFTWARE], @@ -662,6 +675,15 @@ def test_get_sequencing_center_code( SOME_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}", False + ), + ( + RNA_ALIGNED_READS, + [SOME_SOFTWARE], + SOME_REFERENCE_GENOME, + {}, + SOME_FILE_EXTENSION, + "", + True ) ], ) From c275ce45b69d377254e3c18322b1dbc9a7ec0e69 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Thu, 5 Dec 2024 13:04:46 -0500 Subject: [PATCH 10/17] Make annotation a reference file link --- .../commands/create_annotated_filenames.py | 38 +++++-- src/encoded/item_utils/file.py | 6 +- src/encoded/project/loadxl.py | 1 - src/encoded/schemas/gene_annotation.json | 101 ------------------ src/encoded/schemas/mixins.json | 19 ++-- src/encoded/schemas/output_file.json | 2 +- src/encoded/schemas/reference_file.json | 3 + src/encoded/schemas/supplementary_file.json | 2 +- .../data/workbook-inserts/file_format.json | 11 ++ .../workbook-inserts/gene_annotation.json | 10 -- .../data/workbook-inserts/output_file.json | 4 +- .../data/workbook-inserts/reference_file.json | 17 +++ src/encoded/tests/test_annotated_filename.py | 8 +- .../tests/test_metadata_tsv_workbook.py | 4 +- src/encoded/types/gene_annotation.py | 21 ---- 15 files changed, 85 insertions(+), 162 deletions(-) delete mode 100644 src/encoded/schemas/gene_annotation.json delete mode 100644 src/encoded/tests/data/workbook-inserts/gene_annotation.json delete mode 100644 src/encoded/types/gene_annotation.py diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py index f0eecc3e1..20277e337 100644 --- a/src/encoded/commands/create_annotated_filenames.py +++ b/src/encoded/commands/create_annotated_filenames.py @@ -87,7 +87,7 @@ class AssociatedItems: sequencing_center: Dict[str, Any] software: List[Dict[str, Any]] reference_genome: Dict[str, Any] - gene_annotation: Dict[str, Any] + gene_annotations: Dict[str, Any] file_sets: List[Dict[str, Any]] donor_specific_assembly: Dict[str, Any] assays: List[Dict[str, Any]] @@ -114,7 +114,7 @@ def get_associated_items( file_format = get_file_format(file, request_handler) software = get_software(file, request_handler) reference_genome = get_reference_genome(file, request_handler) - gene_annotation = get_gene_annotation(file, request_handler) + gene_annotations = get_gene_annotations(file, request_handler) donor_specific_assembly = get_donor_specific_assembly(file, request_handler) if donor_specific_assembly: file_sets=get_derived_from_file_sets(file, request_handler) @@ -135,7 +135,7 @@ def get_associated_items( file_format=file_format, software=software, reference_genome=reference_genome, - gene_annotation=gene_annotation, + gene_annotations=gene_annotations, file_sets=file_sets, donor_specific_assembly=donor_specific_assembly, assays=assays, @@ -215,11 +215,11 @@ def get_reference_genome( return get_item(file_utils.get_reference_genome(file), request_handler) -def get_gene_annotation( +def get_gene_annotations( file: Dict[str, Any], request_handler: RequestHandler ) -> Dict[str, Any]: - """Get gene annotation for file.""" - return get_item(file_utils.get_gene_annotation(file), request_handler) + """Get gene annotations for file.""" + return get_items(file_utils.get_annotation(file), request_handler) def get_software( @@ -443,7 +443,7 @@ def get_annotated_filename( file, associated_items.software, associated_items.reference_genome, - associated_items.gene_annotation, + associated_items.gene_annotations, associated_items.file_format ) errors = collect_errors( @@ -817,7 +817,7 @@ def get_analysis( file: Dict[str, Any], software: List[Dict[str, Any]], reference_genome: Dict[str, Any], - gene_annotation: Dict[str, Any], + gene_annotations: Dict[str, Any], file_extension: Dict[str, Any], ) -> FilenamePart: """Get analysis info for file. @@ -827,7 +827,7 @@ def get_analysis( """ software_and_versions = get_software_and_versions(software) reference_genome_code = item_utils.get_code(reference_genome) - gene_annotation_code = item_utils.get_code(gene_annotation) + gene_annotation_code = get_gene_annotation_codes(gene_annotations) transcript_info_code = get_rna_seq_tsv_value(file, file_extension) value = get_analysis_value( software_and_versions, @@ -894,6 +894,26 @@ def get_analysis_value( return ANALYSIS_INFO_SEPARATOR.join(to_write) +def get_gene_annotation_codes(gene_annotations: List[Dict[str, Any]]) -> str: + """Get gene annotation codes for file. + """ + codes = [item for item in gene_annotations if item_utils.get_code(item)] + if not codes: + return "" + return get_gene_annotation_codes_string(codes) + + +def get_gene_annotation_codes_string(annotation_items: List[Dict[str, Any]]) -> str: + """Get string representation of gene annotation codes.""" + sorted_annotation_items = sorted(annotation_items, key=item_utils.get_code) + return ANALYSIS_INFO_SEPARATOR.join( + [ + item_utils.get_code(item) + for item in sorted_annotation_items + ] + ) + + def get_software_and_versions(software: List[Dict[str, Any]]) -> str: """Get software and accompanying versions for file. diff --git a/src/encoded/item_utils/file.py b/src/encoded/item_utils/file.py index e40d3cc22..777ec1747 100644 --- a/src/encoded/item_utils/file.py +++ b/src/encoded/item_utils/file.py @@ -76,9 +76,9 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any return properties.get("reference_genome", "") -def get_gene_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]: - """Get gene annotation from properties.""" - return properties.get("gene_annotation", "") +def get_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]: + """Get annotation from properties.""" + return properties.get("annotation", []) def get_file_sets(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]: diff --git a/src/encoded/project/loadxl.py b/src/encoded/project/loadxl.py index 474bbf752..8274c7b9b 100644 --- a/src/encoded/project/loadxl.py +++ b/src/encoded/project/loadxl.py @@ -11,7 +11,6 @@ class SMaHTProjectLoadxl(SnovaultProjectLoadxl): "file_format", "quality_metric", "reference_genome", - "gene_annotation", "software", "tracking_item", "image", diff --git a/src/encoded/schemas/gene_annotation.json b/src/encoded/schemas/gene_annotation.json deleted file mode 100644 index 45d6b6d12..000000000 --- a/src/encoded/schemas/gene_annotation.json +++ /dev/null @@ -1,101 +0,0 @@ -{ - "title": "Gene Annotation", - "$id": "/profiles/gene_annotation.json", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "type": "object", - "required": [ - "identifier", - "title" - ], - "anyOf": [ - { - "required": [ - "submission_centers" - ] - }, - { - "required": [ - "consortia" - ] - } - ], - "identifyingProperties": [ - "accession", - "aliases", - "identifier", - "uuid" - ], - "additionalProperties": false, - "mixinProperties": [ - { - "$ref": "mixins.json#/accession" - }, - { - "$ref": "mixins.json#/aliases" - }, - { - "$ref": "mixins.json#/attribution" - }, - { - "$ref": "mixins.json#/code" - }, - { - "$ref": "mixins.json#/description" - }, - { - "$ref": "mixins.json#/identifier" - }, - { - "$ref": "mixins.json#/modified" - }, - { - "$ref": "mixins.json#/schema_version" - }, - { - "$ref": "mixins.json#/status" - }, - { - "$ref": "mixins.json#/submitted" - }, - { - "$ref": "mixins.json#/tags" - }, - { - "$ref": "mixins.json#/title" - }, - { - "$ref": "mixins.json#/url" - }, - { - "$ref": "mixins.json#/uuid" - }, - { - "$ref": "mixins.json#/version" - } - ], - "properties": { - "accession": { - "accessionType": "GA" - }, - "code": { - "pattern": "^[A-Za-z0-9]{3,}$" - }, - "schema_version": { - "default": "1" - }, - "version": { - "pattern": "^[A-Za-z0-9_-.]{3,}$" - } - }, - "columns": { - "identifier": { - "title": "Identifier" - }, - "code": { - "title": "Code" - }, - "title": { - "title": "Title" - } - } -} \ No newline at end of file diff --git a/src/encoded/schemas/mixins.json b/src/encoded/schemas/mixins.json index f28860b8a..4d74dab42 100644 --- a/src/encoded/schemas/mixins.json +++ b/src/encoded/schemas/mixins.json @@ -74,6 +74,17 @@ } } }, + "annotation": { + "annotation": { + "title": "Gene Annotation", + "description": "Gene annotation used for gene or transcript quantification", + "type": "array", + "items": { + "type": "string", + "linkTo": "ReferenceFile" + } + } + }, "attachment": { "attachment": { "title": "Attached File", @@ -364,14 +375,6 @@ "minimum": 1 } }, - "gene_annotation": { - "gene_annotation": { - "title": "Gene Annotation", - "description": "Gene annotation used for gene or transcript quantification", - "type": "string", - "linkTo": "GeneAnnotation" - } - }, "identifier": { "identifier": { "title": "Identifier", diff --git a/src/encoded/schemas/output_file.json b/src/encoded/schemas/output_file.json index 23526f66b..9ef4237a0 100644 --- a/src/encoded/schemas/output_file.json +++ b/src/encoded/schemas/output_file.json @@ -50,7 +50,7 @@ "$ref": "mixins.json#/file_release" }, { - "$ref": "mixins.json#/gene_annotation" + "$ref": "mixins.json#/annotation" }, { "$ref": "mixins.json#/modified" diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json index c6c74c99a..55fb22c14 100644 --- a/src/encoded/schemas/reference_file.json +++ b/src/encoded/schemas/reference_file.json @@ -37,6 +37,9 @@ { "$ref": "mixins.json#/attribution" }, + { + "$ref": "mixins.json#/code" + }, { "$ref": "mixins.json#/description" }, diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json index 4528972c8..2d6b04b58 100644 --- a/src/encoded/schemas/supplementary_file.json +++ b/src/encoded/schemas/supplementary_file.json @@ -36,7 +36,7 @@ "$ref": "mixins.json#/file_release" }, { - "$ref": "mixins.json#/gene_annotation" + "$ref": "mixins.json#/annotation" }, { "$ref": "mixins.json#/modified" diff --git a/src/encoded/tests/data/workbook-inserts/file_format.json b/src/encoded/tests/data/workbook-inserts/file_format.json index 8c1b40b82..baf9f0bd7 100644 --- a/src/encoded/tests/data/workbook-inserts/file_format.json +++ b/src/encoded/tests/data/workbook-inserts/file_format.json @@ -80,5 +80,16 @@ "ReferenceFile", "SupplementaryFile" ] + }, + { + "uuid": "c3e54d5f-647c-4ca2-9b21-5c01caa3f691", + "submission_centers": [ + "smaht" + ], + "identifier": "GTF", + "standard_file_extension": "gtf", + "valid_item_types": [ + "ReferenceFile" + ] } ] \ No newline at end of file diff --git a/src/encoded/tests/data/workbook-inserts/gene_annotation.json b/src/encoded/tests/data/workbook-inserts/gene_annotation.json deleted file mode 100644 index 79ddcacd3..000000000 --- a/src/encoded/tests/data/workbook-inserts/gene_annotation.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "identifier": "gencode45", - "title": "GENCODE v45", - "code": "gencode45", - "consortia": [ - "smaht" - ] - } -] diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json index 8a8d07a6d..1aa9a7af9 100644 --- a/src/encoded/tests/data/workbook-inserts/output_file.json +++ b/src/encoded/tests/data/workbook-inserts/output_file.json @@ -70,7 +70,9 @@ "foo:software_vep" ], "reference_genome": "GRCh38", - "gene_annotation": "gencode45", + "annotation": [ + "smaht:ReferenceFile-collapsed-genes-gencode_v45" + ], "status": "released", "dataset": "colo829t", "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam" diff --git a/src/encoded/tests/data/workbook-inserts/reference_file.json b/src/encoded/tests/data/workbook-inserts/reference_file.json index a0b54987d..f89f0be2a 100644 --- a/src/encoded/tests/data/workbook-inserts/reference_file.json +++ b/src/encoded/tests/data/workbook-inserts/reference_file.json @@ -21,5 +21,22 @@ ], "file_size": 8000, "status": "restricted" + }, + { + "uuid": "8fc6f554-59c9-490a-b6dc-86665a0b971d", + "aliases": [ + "smaht:ReferenceFile-collapsed-genes-gencode_v45" + ], + "code": "gencode45", + "data_type": [ + "Gene Model" + ], + "data_category": [ + "Genome Annotation" + ], + "file_format": "GTF", + "consortia": [ + "smaht" + ] } ] \ No newline at end of file diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py index c2f7b2d63..a6574a810 100644 --- a/src/encoded/tests/test_annotated_filename.py +++ b/src/encoded/tests/test_annotated_filename.py @@ -530,7 +530,7 @@ def test_get_sequencing_center_code( GENE_ANNOTATION_CODE = "gencode45" SOME_REFERENCE_GENOME = {"code": REFERENCE_GENOME_CODE} -SOME_GENE_ANNOTATION = {"code": GENE_ANNOTATION_CODE} +SOME_GENE_ANNOTATION = [{"code": GENE_ANNOTATION_CODE}] SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]} SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]} RNA_ALIGNED_READS = {"data_type": ["Aligned Reads"], "data_category": ["RNA Quantification"]} @@ -578,7 +578,7 @@ def test_get_sequencing_center_code( @pytest.mark.parametrize( - "file,software,reference_genome,gene_annotation,file_extension,expected,errors", + "file,software,reference_genome,annotation,file_extension,expected,errors", [ ({}, [], {}, {}, {},"" , True), (SOME_UNALIGNED_READS, [], {}, {}, SOME_FILE_EXTENSION,DEFAULT_ABSENT_FIELD, False), @@ -691,13 +691,13 @@ def test_get_analysis( file: Dict[str, Any], software: List[Dict[str, Any]], reference_genome: Dict[str, Any], - gene_annotation: Dict[str, Any], + annotation: Dict[str, Any], file_extension: Dict[str, Any], expected: str, errors: bool, ) -> None: """Test analysis info retrieval for annotated filenames.""" - result = get_analysis(file, software, reference_genome, gene_annotation, file_extension) + result = get_analysis(file, software, reference_genome, annotation, file_extension) assert_filename_part_matches(result, expected, errors) diff --git a/src/encoded/tests/test_metadata_tsv_workbook.py b/src/encoded/tests/test_metadata_tsv_workbook.py index ed8d3002f..37cff652d 100644 --- a/src/encoded/tests/test_metadata_tsv_workbook.py +++ b/src/encoded/tests/test_metadata_tsv_workbook.py @@ -118,12 +118,12 @@ def test_metadata_tsv_workbook(self, workbook, es_testapp): TestMetadataTSVHelper.check_key_and_length(header1, 'Metadata TSV Download') TestMetadataTSVHelper.check_key_and_length(header2, 'Suggested command to download: ') TestMetadataTSVHelper.check_key_and_length(header3, 'FileDownloadURL') - assert len(parsed[3:]) == 19 # there are 19 entries in the workbook right now, including extra files + assert len(parsed[3:]) == 20 # there are 20 entries in the workbook right now, including extra files # test for various types TestMetadataTSVHelper.check_type_length(es_testapp, 'AlignedReads', 3) TestMetadataTSVHelper.check_type_length(es_testapp, 'UnalignedReads', 5) TestMetadataTSVHelper.check_type_length(es_testapp, 'VariantCalls', 2) - TestMetadataTSVHelper.check_type_length(es_testapp, 'ReferenceFile', 1) + TestMetadataTSVHelper.check_type_length(es_testapp, 'ReferenceFile', 2) TestMetadataTSVHelper.check_type_length(es_testapp, 'OutputFile', 2) TestMetadataTSVHelper.check_type_length(es_testapp, 'SupplementaryFile', 2) diff --git a/src/encoded/types/gene_annotation.py b/src/encoded/types/gene_annotation.py deleted file mode 100644 index 961f6e5b5..000000000 --- a/src/encoded/types/gene_annotation.py +++ /dev/null @@ -1,21 +0,0 @@ -from snovault import collection, load_schema - -from .base import Item - -def _build_gene_annotation_embedded_list(): - """Embeds for search on gene annotations.""" - return [] - - -@collection( - name="gene-annotations", - unique_key="gene_annotation:identifier", - properties={ - "title": "Gene Annotations", - "description": "Gene annotations for gene and transcript quantification", - }, -) -class GeneAnnotation(Item): - item_type = "gene_annotation" - schema = load_schema("encoded:schemas/gene_annotation.json") - embedded_list = _build_gene_annotation_embedded_list() From 4a60156059c18f6c5d03404dc0c170245b1b217c Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Thu, 5 Dec 2024 13:26:51 -0500 Subject: [PATCH 11/17] fix merge --- src/encoded/schemas/output_file.json | 4 ---- src/encoded/schemas/supplementary_file.json | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/encoded/schemas/output_file.json b/src/encoded/schemas/output_file.json index 9d5174356..9ef4237a0 100644 --- a/src/encoded/schemas/output_file.json +++ b/src/encoded/schemas/output_file.json @@ -50,11 +50,7 @@ "$ref": "mixins.json#/file_release" }, { -<<<<<<< HEAD "$ref": "mixins.json#/annotation" -======= - "$ref": "mixins.json#/gene_annotation" ->>>>>>> main }, { "$ref": "mixins.json#/modified" diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json index c10bb7c6a..2d6b04b58 100644 --- a/src/encoded/schemas/supplementary_file.json +++ b/src/encoded/schemas/supplementary_file.json @@ -36,11 +36,7 @@ "$ref": "mixins.json#/file_release" }, { -<<<<<<< HEAD "$ref": "mixins.json#/annotation" -======= - "$ref": "mixins.json#/gene_annotation" ->>>>>>> main }, { "$ref": "mixins.json#/modified" From 09db0746934c5c250195ac234d985c63833b54fb Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Thu, 5 Dec 2024 14:05:39 -0500 Subject: [PATCH 12/17] take annotation off supplementary file --- src/encoded/schemas/supplementary_file.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json index 2d6b04b58..a8bce5edf 100644 --- a/src/encoded/schemas/supplementary_file.json +++ b/src/encoded/schemas/supplementary_file.json @@ -35,9 +35,6 @@ { "$ref": "mixins.json#/file_release" }, - { - "$ref": "mixins.json#/annotation" - }, { "$ref": "mixins.json#/modified" }, From 4a9f84e02b2e03ed4b28d856ab57615ccf210dc1 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Fri, 6 Dec 2024 11:12:57 -0500 Subject: [PATCH 13/17] Update release_file and make code non-unique --- src/encoded/commands/release_file.py | 14 ++++++++++++++ src/encoded/item_utils/constants/file.py | 1 + src/encoded/schemas/reference_file.json | 8 ++++++++ src/encoded/schemas/software.json | 3 ++- 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/encoded/commands/release_file.py b/src/encoded/commands/release_file.py index 49c9666e2..0ceebf4ea 100644 --- a/src/encoded/commands/release_file.py +++ b/src/encoded/commands/release_file.py @@ -454,6 +454,9 @@ def get_access_status(self, dataset: str) -> str: file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( file_constants.ACCESS_STATUS_OPEN ), + file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: ( + file_constants.ACCESS_STATUS_OPEN + ) }, IPSC: { file_constants.DATA_CATEGORY_SEQUENCING_READS: ( @@ -465,6 +468,9 @@ def get_access_status(self, dataset: str) -> str: file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( file_constants.ACCESS_STATUS_PROTECTED ), + file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: ( + file_constants.ACCESS_STATUS_OPEN + ) }, self.TISSUE: { file_constants.DATA_CATEGORY_SEQUENCING_READS: ( @@ -476,12 +482,19 @@ def get_access_status(self, dataset: str) -> str: file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( file_constants.ACCESS_STATUS_OPEN ), + file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: ( + file_constants.ACCESS_STATUS_OPEN + ) }, } if dataset in [ "colo829bl", "colo829t", "colo829blt_50to1", + "colo829blt_in_silico", + "colo829_snv_indel_challenge_data", + "hapmap_snv_indel_challenge_data", + "mei_detection_challenge_data", "hapmap", "hg002", "hg00438", @@ -498,6 +511,7 @@ def get_access_status(self, dataset: str) -> str: "lb_ipsc_4", "lb_ipsc_52", "lb_ipsc_60", + "ipsc_snv_indel_challenge_data", ]: dataset_category = IPSC elif dataset == self.TISSUE: diff --git a/src/encoded/item_utils/constants/file.py b/src/encoded/item_utils/constants/file.py index 76374401e..054199cb7 100644 --- a/src/encoded/item_utils/constants/file.py +++ b/src/encoded/item_utils/constants/file.py @@ -6,6 +6,7 @@ DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls" DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads" DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls" +DATA_CATEGORY_RNA_QUANTIFICATION = "RNA Quantification" DATASET = "dataset" EXTRA_FILES = "extra_files" FILE_SETS = "file_sets" diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json index 55fb22c14..0c64abd1c 100644 --- a/src/encoded/schemas/reference_file.json +++ b/src/encoded/schemas/reference_file.json @@ -78,6 +78,14 @@ "properties": { "schema_version": { "default": "2" + }, + "code": { + "pattern": "^[A-Za-z0-9_]{2,}$", + "uniqueKey": false + }, + "version": { + "description": "The version of the reference file", + "pattern": "[A-Za-z0-9._-]+" } } } diff --git a/src/encoded/schemas/software.json b/src/encoded/schemas/software.json index 1bd60a0f1..9ac40155d 100644 --- a/src/encoded/schemas/software.json +++ b/src/encoded/schemas/software.json @@ -125,7 +125,8 @@ } }, "code": { - "pattern": "^[A-Za-z0-9_]{2,}$" + "pattern": "^[A-Za-z0-9_]{2,}$", + "uniqueKey": false }, "submitted_id": { "pattern": "^[A-Z0-9]{3,}_SOFTWARE_[A-Z0-9-_.]{4,}$", From e020705baae82f8005afe5b5dabfb938f00bd8b5 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Fri, 6 Dec 2024 11:39:35 -0500 Subject: [PATCH 14/17] Make code separate from mixins --- .../commands/create_annotated_filenames.py | 52 +++++++++++++++---- src/encoded/schemas/reference_file.json | 10 ++-- src/encoded/schemas/software.json | 10 ++-- .../data/workbook-inserts/output_file.json | 2 +- .../data/workbook-inserts/reference_file.json | 3 +- .../tests/data/workbook-inserts/software.json | 14 +++++ src/encoded/tests/test_annotated_filename.py | 12 ++--- 7 files changed, 76 insertions(+), 27 deletions(-) diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py index 20277e337..9ecbfb005 100644 --- a/src/encoded/commands/create_annotated_filenames.py +++ b/src/encoded/commands/create_annotated_filenames.py @@ -827,7 +827,7 @@ def get_analysis( """ software_and_versions = get_software_and_versions(software) reference_genome_code = item_utils.get_code(reference_genome) - gene_annotation_code = get_gene_annotation_codes(gene_annotations) + gene_annotation_code = get_annotations_and_versions(gene_annotations) transcript_info_code = get_rna_seq_tsv_value(file, file_extension) value = get_analysis_value( software_and_versions, @@ -894,26 +894,60 @@ def get_analysis_value( return ANALYSIS_INFO_SEPARATOR.join(to_write) -def get_gene_annotation_codes(gene_annotations: List[Dict[str, Any]]) -> str: - """Get gene annotation codes for file. +def get_annotations_and_versions(gene_annotations: List[Dict[str, Any]]) -> str: + """Get gene annotation codes and accompanying versions for file. + + Currently only looking for items with codes, as these are + expected to be the annotations used for naming. """ - codes = [item for item in gene_annotations if item_utils.get_code(item)] - if not codes: + annotations_with_codes = get_annotations_with_codes(gene_annotations) + if not annotations_with_codes: return "" - return get_gene_annotation_codes_string(codes) + annotations_with_codes_and_versions = get_annotations_with_versions(annotations_with_codes) + if len(annotations_with_codes) == len(annotations_with_codes_and_versions): + return get_annotations_and_versions_string(annotations_with_codes_and_versions) + missing_versions = get_annotation_codes_missing_versions(annotations_with_codes) + logger.warning(f"Missing versions for annotation items: {missing_versions}.") + return "" + + +def get_annotations_with_codes( + annotation_items: List[Dict[str, Any]] +) -> List[Dict[str, Any]]: + """Get annotation reference file items with codes.""" + return [item for item in annotation_items if item_utils.get_code(item)] -def get_gene_annotation_codes_string(annotation_items: List[Dict[str, Any]]) -> str: - """Get string representation of gene annotation codes.""" +def get_annotations_with_versions( + annotation_items: List[Dict[str, Any]] +) -> List[Dict[str, Any]]: + """Get annotation reference file items with versions.""" + return [item for item in annotation_items if item_utils.get_version(item)] + + +def get_annotations_and_versions_string(annotation_items: List[Dict[str, Any]]) -> str: + """Get string representation of annotation code and versions.""" sorted_annotation_items = sorted(annotation_items, key=item_utils.get_code) return ANALYSIS_INFO_SEPARATOR.join( [ - item_utils.get_code(item) + f"{item_utils.get_code(item)}{ANALYSIS_INFO_SEPARATOR}" + f"{item_utils.get_version(item)}" for item in sorted_annotation_items ] ) +def get_annotation_codes_missing_versions( + annotation_items: List[Dict[str, Any]] +) -> List[str]: + """Get annotation reference file items missing versions.""" + return [ + item_utils.get_code(item) + for item in annotation_items + if not item_utils.get_version(item) + ] + + def get_software_and_versions(software: List[Dict[str, Any]]) -> str: """Get software and accompanying versions for file. diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json index 0c64abd1c..a2065a894 100644 --- a/src/encoded/schemas/reference_file.json +++ b/src/encoded/schemas/reference_file.json @@ -37,9 +37,6 @@ { "$ref": "mixins.json#/attribution" }, - { - "$ref": "mixins.json#/code" - }, { "$ref": "mixins.json#/description" }, @@ -80,8 +77,11 @@ "default": "2" }, "code": { - "pattern": "^[A-Za-z0-9_]{2,}$", - "uniqueKey": false + "title": "Code", + "description": "Code used in file naming scheme", + "type": "string", + "permission": "restricted_fields", + "pattern": "^[A-Za-z0-9_]{2,}$" }, "version": { "description": "The version of the reference file", diff --git a/src/encoded/schemas/software.json b/src/encoded/schemas/software.json index 9ac40155d..074a739d8 100644 --- a/src/encoded/schemas/software.json +++ b/src/encoded/schemas/software.json @@ -59,9 +59,6 @@ { "$ref": "mixins.json#/category" }, - { - "$ref": "mixins.json#/code" - }, { "$ref": "mixins.json#/description" }, @@ -125,8 +122,11 @@ } }, "code": { - "pattern": "^[A-Za-z0-9_]{2,}$", - "uniqueKey": false + "title": "Code", + "description": "Code used in file naming scheme", + "type": "string", + "permission": "restricted_fields", + "pattern": "^[A-Za-z0-9_]{2,}$" }, "submitted_id": { "pattern": "^[A-Z0-9]{3,}_SOFTWARE_[A-Z0-9-_.]{4,}$", diff --git a/src/encoded/tests/data/workbook-inserts/output_file.json b/src/encoded/tests/data/workbook-inserts/output_file.json index 1aa9a7af9..dbec34abc 100644 --- a/src/encoded/tests/data/workbook-inserts/output_file.json +++ b/src/encoded/tests/data/workbook-inserts/output_file.json @@ -75,6 +75,6 @@ ], "status": "released", "dataset": "colo829t", - "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam" + "annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode_v45.aligned.sorted.phased.bam" } ] diff --git a/src/encoded/tests/data/workbook-inserts/reference_file.json b/src/encoded/tests/data/workbook-inserts/reference_file.json index f89f0be2a..ba0330282 100644 --- a/src/encoded/tests/data/workbook-inserts/reference_file.json +++ b/src/encoded/tests/data/workbook-inserts/reference_file.json @@ -27,7 +27,8 @@ "aliases": [ "smaht:ReferenceFile-collapsed-genes-gencode_v45" ], - "code": "gencode45", + "code": "gencode", + "version": "v45", "data_type": [ "Gene Model" ], diff --git a/src/encoded/tests/data/workbook-inserts/software.json b/src/encoded/tests/data/workbook-inserts/software.json index e75a29534..6de034d0d 100644 --- a/src/encoded/tests/data/workbook-inserts/software.json +++ b/src/encoded/tests/data/workbook-inserts/software.json @@ -27,6 +27,20 @@ "code": "strelka", "version": "3.1.1" }, + { + "uuid": "0e6ee3a4-2831-4ee4-b648-f53808282f38", + "submission_centers": [ + "smaht" + ], + "submitted_id": "TEST_SOFTWARE_BWA-MEM_2.0.0", + "category": [ + "Alignment" + ], + "name": "bwa_mem_v2", + "title": "BWA-MEM", + "code": "bwamem", + "version": "2.0.0" + }, { "uuid": "be085e03-0989-4b44-81af-37efef5aa086", "submission_centers": [ diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py index a6574a810..398c12845 100644 --- a/src/encoded/tests/test_annotated_filename.py +++ b/src/encoded/tests/test_annotated_filename.py @@ -527,10 +527,10 @@ def test_get_sequencing_center_code( ANOTHER_SOFTWARE = {"code": ANOTHER_SOFTWARE_CODE, "version": ANOTHER_SOFTWARE_VERSION} REFERENCE_GENOME_CODE = "GRCh38" TARGET_GENOME_CODE = "HELA_DSA" -GENE_ANNOTATION_CODE = "gencode45" - +GENE_ANNOTATION_CODE = "gencode" +GENE_ANNOTATION_VERSION = "v45" SOME_REFERENCE_GENOME = {"code": REFERENCE_GENOME_CODE} -SOME_GENE_ANNOTATION = [{"code": GENE_ANNOTATION_CODE}] +SOME_GENE_ANNOTATION = [{"code": GENE_ANNOTATION_CODE, "version": GENE_ANNOTATION_VERSION}] SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]} SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]} RNA_ALIGNED_READS = {"data_type": ["Aligned Reads"], "data_category": ["RNA Quantification"]} @@ -646,7 +646,7 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, SOME_GENE_ANNOTATION, TSV_FILE_EXTENSION, - f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_gene", + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}_gene", False ), ( @@ -655,7 +655,7 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, SOME_GENE_ANNOTATION, TSV_FILE_EXTENSION, - f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_isoform", + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}_isoform", False ), ( @@ -673,7 +673,7 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, SOME_GENE_ANNOTATION, SOME_FILE_EXTENSION, - f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}", + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}", False ), ( From ea6e7d39f78c465d3dc11f9562b1c2d326e69322 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Fri, 6 Dec 2024 14:55:34 -0500 Subject: [PATCH 15/17] Add title to reference file --- src/encoded/schemas/reference_file.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/encoded/schemas/reference_file.json b/src/encoded/schemas/reference_file.json index a2065a894..4c652cae3 100644 --- a/src/encoded/schemas/reference_file.json +++ b/src/encoded/schemas/reference_file.json @@ -52,6 +52,9 @@ { "$ref": "mixins.json#/tags" }, + { + "$ref": "mixins.json#/title" + }, { "$ref": "mixins.json#/url" }, From 9c20c23038c702e4b338f57aab5f6bc7c101e704 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Thu, 12 Dec 2024 10:48:04 -0500 Subject: [PATCH 16/17] embed reference file title, code, and version --- src/encoded/schemas/mixins.json | 2 +- src/encoded/types/output_file.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/encoded/schemas/mixins.json b/src/encoded/schemas/mixins.json index 9574b1af9..ad9e48741 100644 --- a/src/encoded/schemas/mixins.json +++ b/src/encoded/schemas/mixins.json @@ -77,7 +77,7 @@ "annotation": { "annotation": { "title": "Gene Annotation", - "description": "Gene annotation used for gene or transcript quantification", + "description": "Gene annotation used for analysis", "type": "array", "items": { "type": "string", diff --git a/src/encoded/types/output_file.py b/src/encoded/types/output_file.py index 601e603ce..5fdd38087 100644 --- a/src/encoded/types/output_file.py +++ b/src/encoded/types/output_file.py @@ -4,6 +4,14 @@ from .file import File +def _build_output_file_embedded_list(): + """Embeds for search on cell cultures.""" + return File.embedded_list + [ + "annotation.code", + "annotation.version", + "annotation.title", + ] + @collection( name="output-files", acl=ONLY_ADMIN_VIEW_ACL, @@ -15,7 +23,7 @@ class OutputFile(File): item_type = "output_file" schema = load_schema("encoded:schemas/output_file.json") - embedded_list = File.embedded_list + embedded_list = _build_output_file_embedded_list() # processed files don't want md5 as unique key def unique_keys(self, properties): From 5c0db5116ea78287475d78fe67be29c82f8ecda9 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Thu, 12 Dec 2024 12:51:22 -0500 Subject: [PATCH 17/17] Add title to reference file insert --- src/encoded/tests/data/workbook-inserts/reference_file.json | 1 + 1 file changed, 1 insertion(+) diff --git a/src/encoded/tests/data/workbook-inserts/reference_file.json b/src/encoded/tests/data/workbook-inserts/reference_file.json index ba0330282..070e4a4b0 100644 --- a/src/encoded/tests/data/workbook-inserts/reference_file.json +++ b/src/encoded/tests/data/workbook-inserts/reference_file.json @@ -28,6 +28,7 @@ "smaht:ReferenceFile-collapsed-genes-gencode_v45" ], "code": "gencode", + "title": "GENCODEv45", "version": "v45", "data_type": [ "Gene Model"