Skip to content

Commit

Permalink
SN Rnaseq filenames (#303)
Browse files Browse the repository at this point in the history
* Start with rna-seq annotated filename functions

* Add gene_annotation and update rnaseq filenames

* Fix tests

* commit new files

* update consortia requirement

* Fix annotated filename insert

* remove files from gene annotation

* Merge up-to-date with main

* Add error message for no transcript info

* Make annotation a reference file link

* fix merge

* take annotation off supplementary file

* Update release_file and make code non-unique

* Make code separate from mixins

* Add title to reference file

* embed reference file title, code, and version

* Add title to reference file insert
  • Loading branch information
sarahgonicholson authored Dec 12, 2024
1 parent 411e838 commit 844748e
Show file tree
Hide file tree
Showing 18 changed files with 188 additions and 34 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ smaht-portal
Change Log
----------

0.118.0
=======
`PR 303: SN Rnaseq filenames <https://github.com/smaht-dac/smaht-portal/pull/303>`_

* Make `annotation` link in OutputFile an array of links to ReferenceFile
* Add `code` property to ReferenceFile


0.117.1
=======
`PR 284: Bm nomenclature page3 <https://github.com/smaht-dac/smaht-portal/pull/284>`_
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "encoded"
version = "0.117.1"
version = "0.118.0"
description = "SMaHT Data Analysis Portal"
authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
license = "MIT"
Expand Down
72 changes: 63 additions & 9 deletions src/encoded/commands/create_annotated_filenames.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class AssociatedItems:
sequencing_center: Dict[str, Any]
software: List[Dict[str, Any]]
reference_genome: Dict[str, Any]
gene_annotation: Dict[str, Any]
gene_annotations: Dict[str, Any]
file_sets: List[Dict[str, Any]]
donor_specific_assembly: Dict[str, Any]
assays: List[Dict[str, Any]]
Expand All @@ -114,7 +114,7 @@ def get_associated_items(
file_format = get_file_format(file, request_handler)
software = get_software(file, request_handler)
reference_genome = get_reference_genome(file, request_handler)
gene_annotation = get_gene_annotation(file, request_handler)
gene_annotations = get_gene_annotations(file, request_handler)
donor_specific_assembly = get_donor_specific_assembly(file, request_handler)
if donor_specific_assembly:
file_sets=get_derived_from_file_sets(file, request_handler)
Expand All @@ -135,7 +135,7 @@ def get_associated_items(
file_format=file_format,
software=software,
reference_genome=reference_genome,
gene_annotation=gene_annotation,
gene_annotations=gene_annotations,
file_sets=file_sets,
donor_specific_assembly=donor_specific_assembly,
assays=assays,
Expand Down Expand Up @@ -215,11 +215,11 @@ def get_reference_genome(
return get_item(file_utils.get_reference_genome(file), request_handler)


def get_gene_annotation(
def get_gene_annotations(
file: Dict[str, Any], request_handler: RequestHandler
) -> Dict[str, Any]:
"""Get gene annotation for file."""
return get_item(file_utils.get_gene_annotation(file), request_handler)
"""Get gene annotations for file."""
return get_items(file_utils.get_annotation(file), request_handler)


def get_software(
Expand Down Expand Up @@ -443,7 +443,7 @@ def get_annotated_filename(
file,
associated_items.software,
associated_items.reference_genome,
associated_items.gene_annotation,
associated_items.gene_annotations,
associated_items.file_format
)
errors = collect_errors(
Expand Down Expand Up @@ -817,7 +817,7 @@ def get_analysis(
file: Dict[str, Any],
software: List[Dict[str, Any]],
reference_genome: Dict[str, Any],
gene_annotation: Dict[str, Any],
gene_annotations: Dict[str, Any],
file_extension: Dict[str, Any],
) -> FilenamePart:
"""Get analysis info for file.
Expand All @@ -827,7 +827,7 @@ def get_analysis(
"""
software_and_versions = get_software_and_versions(software)
reference_genome_code = item_utils.get_code(reference_genome)
gene_annotation_code = item_utils.get_code(gene_annotation)
gene_annotation_code = get_annotations_and_versions(gene_annotations)
transcript_info_code = get_rna_seq_tsv_value(file, file_extension)
value = get_analysis_value(
software_and_versions,
Expand Down Expand Up @@ -894,6 +894,60 @@ def get_analysis_value(
return ANALYSIS_INFO_SEPARATOR.join(to_write)


def get_annotations_and_versions(gene_annotations: List[Dict[str, Any]]) -> str:
    """Build the gene annotation portion of the analysis info.

    Only annotation items that carry a code are considered, as these
    are expected to be the annotations used for naming. Every such item
    must also carry a version; if any does not, a warning listing the
    affected codes is logged and an empty string is returned.
    """
    coded = get_annotations_with_codes(gene_annotations)
    if coded:
        versioned = get_annotations_with_versions(coded)
        if len(versioned) != len(coded):
            # At least one coded annotation has no version: report and bail out.
            missing_versions = get_annotation_codes_missing_versions(coded)
            logger.warning(f"Missing versions for annotation items: {missing_versions}.")
            return ""
        return get_annotations_and_versions_string(versioned)
    return ""


def get_annotations_with_codes(
    annotation_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the annotation reference file items that have a code."""
    with_codes = []
    for annotation in annotation_items:
        if item_utils.get_code(annotation):
            with_codes.append(annotation)
    return with_codes


def get_annotations_with_versions(
    annotation_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the annotation reference file items that have a version."""
    # filter() keeps the items whose version is truthy, in input order.
    return list(filter(item_utils.get_version, annotation_items))


def get_annotations_and_versions_string(annotation_items: List[Dict[str, Any]]) -> str:
    """Render annotation codes and versions as a single string.

    Items are ordered by code; each contributes its code and version
    joined by ANALYSIS_INFO_SEPARATOR, and the per-item pieces are
    joined with the same separator.
    """
    ordered = list(annotation_items)
    ordered.sort(key=item_utils.get_code)
    pieces = []
    for annotation in ordered:
        pieces.append(
            f"{item_utils.get_code(annotation)}{ANALYSIS_INFO_SEPARATOR}"
            f"{item_utils.get_version(annotation)}"
        )
    return ANALYSIS_INFO_SEPARATOR.join(pieces)


def get_annotation_codes_missing_versions(
    annotation_items: List[Dict[str, Any]]
) -> List[str]:
    """Return the codes of annotation reference file items lacking a version."""
    codes = []
    for annotation in annotation_items:
        if item_utils.get_version(annotation):
            continue
        codes.append(item_utils.get_code(annotation))
    return codes


def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
"""Get software and accompanying versions for file.
Expand Down
14 changes: 14 additions & 0 deletions src/encoded/commands/release_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,9 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
},
IPSC: {
file_constants.DATA_CATEGORY_SEQUENCING_READS: (
Expand All @@ -465,6 +468,9 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
},
self.TISSUE: {
file_constants.DATA_CATEGORY_SEQUENCING_READS: (
Expand All @@ -476,12 +482,19 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
},
}
if dataset in [
"colo829bl",
"colo829t",
"colo829blt_50to1",
"colo829blt_in_silico",
"colo829_snv_indel_challenge_data",
"hapmap_snv_indel_challenge_data",
"mei_detection_challenge_data",
"hapmap",
"hg002",
"hg00438",
Expand All @@ -498,6 +511,7 @@ def get_access_status(self, dataset: str) -> str:
"lb_ipsc_4",
"lb_ipsc_52",
"lb_ipsc_60",
"ipsc_snv_indel_challenge_data",
]:
dataset_category = IPSC
elif dataset == self.TISSUE:
Expand Down
1 change: 1 addition & 0 deletions src/encoded/item_utils/constants/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls"
DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads"
DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls"
DATA_CATEGORY_RNA_QUANTIFICATION = "RNA Quantification"
DATASET = "dataset"
EXTRA_FILES = "extra_files"
FILE_SETS = "file_sets"
Expand Down
6 changes: 3 additions & 3 deletions src/encoded/item_utils/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any
return properties.get("reference_genome", "")


def get_gene_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]:
"""Get gene annotation from properties."""
return properties.get("gene_annotation", "")
def get_annotation(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]:
    """Get annotation reference file links from properties.

    Returns the value stored under "annotation", or an empty list when
    the property is absent.
    """
    return properties.get("annotation", [])


def get_file_sets(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]:
Expand Down
11 changes: 11 additions & 0 deletions src/encoded/schemas/mixins.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@
}
}
},
"annotation": {
"annotation": {
"title": "Gene Annotation",
"description": "Gene annotation used for analysis",
"type": "array",
"items": {
"type": "string",
"linkTo": "ReferenceFile"
}
}
},
"attachment": {
"attachment": {
"title": "Attached File",
Expand Down
2 changes: 1 addition & 1 deletion src/encoded/schemas/output_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"$ref": "mixins.json#/file_release"
},
{
"$ref": "mixins.json#/gene_annotation"
"$ref": "mixins.json#/annotation"
},
{
"$ref": "mixins.json#/modified"
Expand Down
14 changes: 14 additions & 0 deletions src/encoded/schemas/reference_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@
{
"$ref": "mixins.json#/tags"
},
{
"$ref": "mixins.json#/title"
},
{
"$ref": "mixins.json#/url"
},
Expand All @@ -75,6 +78,17 @@
"properties": {
"schema_version": {
"default": "2"
},
"code": {
"title": "Code",
"description": "Code used in file naming scheme",
"type": "string",
"permission": "restricted_fields",
"pattern": "^[A-Za-z0-9_]{2,}$"
},
"version": {
"description": "The version of the reference file",
"pattern": "[A-Za-z0-9._-]+"
}
}
}
7 changes: 4 additions & 3 deletions src/encoded/schemas/software.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,6 @@
{
"$ref": "mixins.json#/category"
},
{
"$ref": "mixins.json#/code"
},
{
"$ref": "mixins.json#/description"
},
Expand Down Expand Up @@ -125,6 +122,10 @@
}
},
"code": {
"title": "Code",
"description": "Code used in file naming scheme",
"type": "string",
"permission": "restricted_fields",
"pattern": "^[A-Za-z0-9_]{2,}$"
},
"submitted_id": {
Expand Down
3 changes: 0 additions & 3 deletions src/encoded/schemas/supplementary_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@
{
"$ref": "mixins.json#/file_release"
},
{
"$ref": "mixins.json#/gene_annotation"
},
{
"$ref": "mixins.json#/modified"
},
Expand Down
11 changes: 11 additions & 0 deletions src/encoded/tests/data/workbook-inserts/file_format.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,16 @@
"ReferenceFile",
"SupplementaryFile"
]
},
{
"uuid": "c3e54d5f-647c-4ca2-9b21-5c01caa3f691",
"submission_centers": [
"smaht"
],
"identifier": "GTF",
"standard_file_extension": "gtf",
"valid_item_types": [
"ReferenceFile"
]
}
]
6 changes: 4 additions & 2 deletions src/encoded/tests/data/workbook-inserts/output_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,11 @@
"foo:software_vep"
],
"reference_genome": "GRCh38",
"gene_annotation": "gencode45",
"annotation": [
"smaht:ReferenceFile-collapsed-genes-gencode_v45"
],
"status": "released",
"dataset": "colo829t",
"annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam"
"annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode_v45.aligned.sorted.phased.bam"
}
]
19 changes: 19 additions & 0 deletions src/encoded/tests/data/workbook-inserts/reference_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,24 @@
],
"file_size": 8000,
"status": "restricted"
},
{
"uuid": "8fc6f554-59c9-490a-b6dc-86665a0b971d",
"aliases": [
"smaht:ReferenceFile-collapsed-genes-gencode_v45"
],
"code": "gencode",
"title": "GENCODEv45",
"version": "v45",
"data_type": [
"Gene Model"
],
"data_category": [
"Genome Annotation"
],
"file_format": "GTF",
"consortia": [
"smaht"
]
}
]
14 changes: 14 additions & 0 deletions src/encoded/tests/data/workbook-inserts/software.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@
"code": "strelka",
"version": "3.1.1"
},
{
"uuid": "0e6ee3a4-2831-4ee4-b648-f53808282f38",
"submission_centers": [
"smaht"
],
"submitted_id": "TEST_SOFTWARE_BWA-MEM_2.0.0",
"category": [
"Alignment"
],
"name": "bwa_mem_v2",
"title": "BWA-MEM",
"code": "bwamem",
"version": "2.0.0"
},
{
"uuid": "be085e03-0989-4b44-81af-37efef5aa086",
"submission_centers": [
Expand Down
Loading

0 comments on commit 844748e

Please sign in to comment.