Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SN Rnaseq filenames #303

Merged
merged 22 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
8f9856e
Start with rna-seq annotated filename functions
sarahgonicholson Nov 21, 2024
7fa4fab
Add gene_annotation and update rnaseq filenames
sarahgonicholson Nov 26, 2024
a8a2208
Fix tests
sarahgonicholson Nov 26, 2024
dec92ca
commit new files
sarahgonicholson Nov 26, 2024
cb10cb3
update consortia requirement
sarahgonicholson Nov 26, 2024
5cf2fea
Fix annotated filename insert
sarahgonicholson Nov 26, 2024
de202b1
remove files from gene annotation
sarahgonicholson Dec 2, 2024
64afe91
Merge branch 'main' into sn_rnaseq_filenames
sarahgonicholson Dec 3, 2024
a47704d
Merge up-to-date with main
sarahgonicholson Dec 3, 2024
393b526
Add error message for no transcript info
sarahgonicholson Dec 4, 2024
444993b
Merge up-to-date with main
sarahgonicholson Dec 4, 2024
c275ce4
Make annotation a reference file link
sarahgonicholson Dec 5, 2024
511bd5e
Update changelog
sarahgonicholson Dec 5, 2024
4a60156
fix merge
sarahgonicholson Dec 5, 2024
09db074
take annotation off supplementary file
sarahgonicholson Dec 5, 2024
4a9f84e
Update release_file and make code non-unique
sarahgonicholson Dec 6, 2024
e020705
Make code separate from mixins
sarahgonicholson Dec 6, 2024
3ec09e4
Merge up-to-date with main
sarahgonicholson Dec 6, 2024
ea6e7d3
Add title to reference file
sarahgonicholson Dec 6, 2024
9c20c23
embed reference file title, code, and version
sarahgonicholson Dec 12, 2024
3d63eff
Merge up-to-date with main
sarahgonicholson Dec 12, 2024
5c0db51
Add title to reference file insert
sarahgonicholson Dec 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ smaht-portal
Change Log
----------

0.118.0
=======
`PR 303: SN Rnaseq filenames <https://github.com/smaht-dac/smaht-portal/pull/303>`_

* Make `annotation` link in OutputFile an array of links to ReferenceFile
* Add `code` property to ReferenceFile


0.117.1
=======
`PR 284: Bm nomenclature page3 <https://github.com/smaht-dac/smaht-portal/pull/284>`_
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "encoded"
version = "0.117.1"
version = "0.118.0"
description = "SMaHT Data Analysis Portal"
authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
license = "MIT"
Expand Down
72 changes: 63 additions & 9 deletions src/encoded/commands/create_annotated_filenames.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class AssociatedItems:
sequencing_center: Dict[str, Any]
software: List[Dict[str, Any]]
reference_genome: Dict[str, Any]
gene_annotation: Dict[str, Any]
gene_annotations: Dict[str, Any]
file_sets: List[Dict[str, Any]]
donor_specific_assembly: Dict[str, Any]
assays: List[Dict[str, Any]]
Expand All @@ -114,7 +114,7 @@ def get_associated_items(
file_format = get_file_format(file, request_handler)
software = get_software(file, request_handler)
reference_genome = get_reference_genome(file, request_handler)
gene_annotation = get_gene_annotation(file, request_handler)
gene_annotations = get_gene_annotations(file, request_handler)
donor_specific_assembly = get_donor_specific_assembly(file, request_handler)
if donor_specific_assembly:
file_sets=get_derived_from_file_sets(file, request_handler)
Expand All @@ -135,7 +135,7 @@ def get_associated_items(
file_format=file_format,
software=software,
reference_genome=reference_genome,
gene_annotation=gene_annotation,
gene_annotations=gene_annotations,
file_sets=file_sets,
donor_specific_assembly=donor_specific_assembly,
assays=assays,
Expand Down Expand Up @@ -215,11 +215,11 @@ def get_reference_genome(
return get_item(file_utils.get_reference_genome(file), request_handler)


def get_gene_annotation(
def get_gene_annotations(
file: Dict[str, Any], request_handler: RequestHandler
) -> Dict[str, Any]:
"""Get gene annotation for file."""
return get_item(file_utils.get_gene_annotation(file), request_handler)
"""Get gene annotations for file."""
return get_items(file_utils.get_annotation(file), request_handler)


def get_software(
Expand Down Expand Up @@ -443,7 +443,7 @@ def get_annotated_filename(
file,
associated_items.software,
associated_items.reference_genome,
associated_items.gene_annotation,
associated_items.gene_annotations,
associated_items.file_format
)
errors = collect_errors(
Expand Down Expand Up @@ -817,7 +817,7 @@ def get_analysis(
file: Dict[str, Any],
software: List[Dict[str, Any]],
reference_genome: Dict[str, Any],
gene_annotation: Dict[str, Any],
gene_annotations: Dict[str, Any],
file_extension: Dict[str, Any],
) -> FilenamePart:
"""Get analysis info for file.
Expand All @@ -827,7 +827,7 @@ def get_analysis(
"""
software_and_versions = get_software_and_versions(software)
reference_genome_code = item_utils.get_code(reference_genome)
gene_annotation_code = item_utils.get_code(gene_annotation)
gene_annotation_code = get_annotations_and_versions(gene_annotations)
transcript_info_code = get_rna_seq_tsv_value(file, file_extension)
value = get_analysis_value(
software_and_versions,
Expand Down Expand Up @@ -894,6 +894,60 @@ def get_analysis_value(
return ANALYSIS_INFO_SEPARATOR.join(to_write)


def get_annotations_and_versions(gene_annotations: List[Dict[str, Any]]) -> str:
    """Build the annotation portion of the analysis info for a file.

    Only annotation items that carry a code are considered, since those
    are the ones expected to be used for naming. Every such item must
    also carry a version; otherwise a warning is logged and nothing is
    reported.

    Returns:
        The code/version string for the coded annotations, or an empty
        string when no item has a code or any coded item lacks a version.
    """
    coded_items = get_annotations_with_codes(gene_annotations)
    if not coded_items:
        return ""
    versioned_items = get_annotations_with_versions(coded_items)
    if len(versioned_items) != len(coded_items):
        # At least one coded annotation has no version: report none of them.
        missing_versions = get_annotation_codes_missing_versions(coded_items)
        logger.warning(f"Missing versions for annotation items: {missing_versions}.")
        return ""
    return get_annotations_and_versions_string(versioned_items)


def get_annotations_with_codes(
    annotation_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the annotation items for which a code is found, in input order."""
    coded_items = []
    for annotation in annotation_items:
        if item_utils.get_code(annotation):
            coded_items.append(annotation)
    return coded_items


def get_annotations_with_versions(
    annotation_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the annotation items for which a version is found, in input order."""
    versioned_items = []
    for annotation in annotation_items:
        if item_utils.get_version(annotation):
            versioned_items.append(annotation)
    return versioned_items


def get_annotations_and_versions_string(annotation_items: List[Dict[str, Any]]) -> str:
    """Join each annotation's code and version into a single string.

    Items are ordered by code; each item contributes its code followed by
    its version, and every piece is joined with ANALYSIS_INFO_SEPARATOR.
    """
    # Sort a copy so the caller's list is left untouched.
    ordered_items = list(annotation_items)
    ordered_items.sort(key=item_utils.get_code)
    code_version_pairs = []
    for annotation in ordered_items:
        code = item_utils.get_code(annotation)
        version = item_utils.get_version(annotation)
        code_version_pairs.append(f"{code}{ANALYSIS_INFO_SEPARATOR}{version}")
    return ANALYSIS_INFO_SEPARATOR.join(code_version_pairs)


def get_annotation_codes_missing_versions(
    annotation_items: List[Dict[str, Any]]
) -> List[str]:
    """Return the codes of the annotation items for which no version is found."""
    codes_without_version = []
    for annotation in annotation_items:
        if item_utils.get_version(annotation):
            continue
        codes_without_version.append(item_utils.get_code(annotation))
    return codes_without_version


def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
"""Get software and accompanying versions for file.

Expand Down
14 changes: 14 additions & 0 deletions src/encoded/commands/release_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,9 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
},
IPSC: {
file_constants.DATA_CATEGORY_SEQUENCING_READS: (
Expand All @@ -465,6 +468,9 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
},
self.TISSUE: {
file_constants.DATA_CATEGORY_SEQUENCING_READS: (
Expand All @@ -476,12 +482,19 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
},
}
if dataset in [
"colo829bl",
"colo829t",
"colo829blt_50to1",
"colo829blt_in_silico",
"colo829_snv_indel_challenge_data",
"hapmap_snv_indel_challenge_data",
"mei_detection_challenge_data",
"hapmap",
"hg002",
"hg00438",
Expand All @@ -498,6 +511,7 @@ def get_access_status(self, dataset: str) -> str:
"lb_ipsc_4",
"lb_ipsc_52",
"lb_ipsc_60",
"ipsc_snv_indel_challenge_data",
]:
dataset_category = IPSC
elif dataset == self.TISSUE:
Expand Down
1 change: 1 addition & 0 deletions src/encoded/item_utils/constants/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls"
DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads"
DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls"
DATA_CATEGORY_RNA_QUANTIFICATION = "RNA Quantification"
DATASET = "dataset"
EXTRA_FILES = "extra_files"
FILE_SETS = "file_sets"
Expand Down
6 changes: 3 additions & 3 deletions src/encoded/item_utils/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any
return properties.get("reference_genome", "")


def get_gene_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]:
"""Get gene annotation from properties."""
return properties.get("gene_annotation", "")
def get_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]:
"""Get annotation from properties."""
return properties.get("annotation", [])


def get_file_sets(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]:
Expand Down
11 changes: 11 additions & 0 deletions src/encoded/schemas/mixins.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@
}
}
},
"annotation": {
"annotation": {
"title": "Gene Annotation",
"description": "Gene annotation used for analysis",
"type": "array",
"items": {
"type": "string",
"linkTo": "ReferenceFile"
}
}
},
"attachment": {
"attachment": {
"title": "Attached File",
Expand Down
2 changes: 1 addition & 1 deletion src/encoded/schemas/output_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"$ref": "mixins.json#/file_release"
},
{
"$ref": "mixins.json#/gene_annotation"
"$ref": "mixins.json#/annotation"
},
{
"$ref": "mixins.json#/modified"
Expand Down
14 changes: 14 additions & 0 deletions src/encoded/schemas/reference_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@
{
"$ref": "mixins.json#/tags"
},
{
"$ref": "mixins.json#/title"
},
{
"$ref": "mixins.json#/url"
},
Expand All @@ -75,6 +78,17 @@
"properties": {
"schema_version": {
"default": "2"
},
"code": {
"title": "Code",
"description": "Code used in file naming scheme",
"type": "string",
"permission": "restricted_fields",
"pattern": "^[A-Za-z0-9_]{2,}$"
},
"version": {
"description": "The version of the reference file",
"pattern": "[A-Za-z0-9._-]+"
}
}
}
7 changes: 4 additions & 3 deletions src/encoded/schemas/software.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,6 @@
{
"$ref": "mixins.json#/category"
},
{
"$ref": "mixins.json#/code"
},
{
"$ref": "mixins.json#/description"
},
Expand Down Expand Up @@ -125,6 +122,10 @@
}
},
"code": {
"title": "Code",
"description": "Code used in file naming scheme",
"type": "string",
"permission": "restricted_fields",
"pattern": "^[A-Za-z0-9_]{2,}$"
},
"submitted_id": {
Expand Down
3 changes: 0 additions & 3 deletions src/encoded/schemas/supplementary_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@
{
"$ref": "mixins.json#/file_release"
},
{
"$ref": "mixins.json#/gene_annotation"
},
{
"$ref": "mixins.json#/modified"
},
Expand Down
11 changes: 11 additions & 0 deletions src/encoded/tests/data/workbook-inserts/file_format.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,16 @@
"ReferenceFile",
"SupplementaryFile"
]
},
{
"uuid": "c3e54d5f-647c-4ca2-9b21-5c01caa3f691",
"submission_centers": [
"smaht"
],
"identifier": "GTF",
"standard_file_extension": "gtf",
"valid_item_types": [
"ReferenceFile"
]
}
]
6 changes: 4 additions & 2 deletions src/encoded/tests/data/workbook-inserts/output_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,11 @@
"foo:software_vep"
],
"reference_genome": "GRCh38",
"gene_annotation": "gencode45",
"annotation": [
"smaht:ReferenceFile-collapsed-genes-gencode_v45"
],
"status": "released",
"dataset": "colo829t",
"annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode45.aligned.sorted.phased.bam"
"annotated_filename": "ST001-1D-XX-M45-B004-test-SMAFI8LOZ6MU-bwamem_1.2.3_GRCh38_gencode_v45.aligned.sorted.phased.bam"
}
]
19 changes: 19 additions & 0 deletions src/encoded/tests/data/workbook-inserts/reference_file.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,24 @@
],
"file_size": 8000,
"status": "restricted"
},
{
"uuid": "8fc6f554-59c9-490a-b6dc-86665a0b971d",
"aliases": [
"smaht:ReferenceFile-collapsed-genes-gencode_v45"
],
"code": "gencode",
"title": "GENCODEv45",
"version": "v45",
"data_type": [
"Gene Model"
],
"data_category": [
"Genome Annotation"
],
"file_format": "GTF",
"consortia": [
"smaht"
]
}
]
14 changes: 14 additions & 0 deletions src/encoded/tests/data/workbook-inserts/software.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@
"code": "strelka",
"version": "3.1.1"
},
{
"uuid": "0e6ee3a4-2831-4ee4-b648-f53808282f38",
"submission_centers": [
"smaht"
],
"submitted_id": "TEST_SOFTWARE_BWA-MEM_2.0.0",
"category": [
"Alignment"
],
"name": "bwa_mem_v2",
"title": "BWA-MEM",
"code": "bwamem",
"version": "2.0.0"
},
{
"uuid": "be085e03-0989-4b44-81af-37efef5aa086",
"submission_centers": [
Expand Down
Loading
Loading