Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Add option to automatically rename table attributes when matching genes with match_table_attributes #272

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 23 additions & 13 deletions orangecontrib/bioinformatics/ncbi/gene/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def load_attributes(self, values: Tuple[str, ...], attributes: Tuple[str, ...] =
setattr(self, attr, json.loads(val) if attr in ('synonyms', 'db_refs', 'homologs') else val)

def homolog_gene(self, taxonomy_id: str) -> Optional[str]:
""" Returns gene homolog for given organism.
"""Returns gene homolog for given organism.

Parameters
----------
Expand Down Expand Up @@ -95,7 +95,7 @@ def genes(self, genes: List[str]) -> None:
self._match()

def get_known_genes(self) -> List[Gene]:
""" Return Genes with known Entrez ID
"""Return Genes with known Entrez ID

Returns
-------
Expand All @@ -106,7 +106,7 @@ def get_known_genes(self) -> List[Gene]:
return [gene for gene in self.genes if gene.gene_id]

def to_data_table(self, selected_genes: Optional[List[str]] = None) -> Table:
""" Transform GeneMatcher results to Orange data table.
"""Transform GeneMatcher results to Orange data table.

Optionally we can provide a list of genes (Entrez Ids).
The table on the output will be populated only with provided genes.
Expand Down Expand Up @@ -184,7 +184,7 @@ def to_data_table(self, selected_genes: Optional[List[str]] = None) -> Table:
def match_table_column(
self, data_table: Table, column_name: str, target_column: Optional[StringVariable] = None
) -> Table:
""" Helper function for gene name matching with :class:`Orange.data.Table`.
"""Helper function for gene name matching with :class:`Orange.data.Table`.

Given a column of genes, GeneMatcher will try to map genes to their
corresponding Entrez Ids.
Expand Down Expand Up @@ -223,8 +223,8 @@ def match_table_column(

return new_data

def match_table_attributes(self, data_table):
""" Helper function for gene name matching with :class:`Orange.data.Table`.
def match_table_attributes(self, data_table, rename=False, source_name='Source ID') -> Table:
"""Helper function for gene name matching with :class:`Orange.data.Table`.

Match table attributes and if a unique match is found create a new column attribute
for Entrez Id. Attribute name is defined here: `orangecontrib.bioinformatics.ncbi.gene.config.NCBI_ID`
Expand All @@ -237,18 +237,28 @@ def match_table_attributes(self, data_table):

Returns
-------

:class:`Orange.data.Table`
Data table column attributes are populated with Entrez Ids

"""
input_gene_names = [var.name for var in data_table.domain.attributes]

if input_gene_names:
self.genes = input_gene_names
# run gene matcher
self.genes = [var.name for var in data_table.domain.attributes]

def helper(gene, attribute):
if gene.gene_id:
if rename:
attribute = attribute.renamed(gene.symbol)
attribute.attributes[source_name] = gene.input_identifier

attribute.attributes[ENTREZ_ID] = gene.gene_id
return attribute

attributes = [helper(gene, attr) for gene, attr in zip(self.genes, data_table.domain.attributes)]
domain = Domain(attributes, data_table.domain.class_vars, data_table.domain.metas)

for gene in self.genes:
if gene.gene_id:
data_table.domain[gene.input_identifier].attributes[ENTREZ_ID] = gene.gene_id
return data_table.transform(domain)

def match_genes(self):
self._match()
Expand Down Expand Up @@ -299,7 +309,7 @@ def _match(self):

class GeneInfo(dict):
def __init__(self, tax_id: str):
""" Loads genes for given organism in a dict.
"""Loads genes for given organism in a dict.

Each instance of :class:`Gene` is mapped to corresponding Entrez ID

Expand Down
3 changes: 2 additions & 1 deletion orangecontrib/bioinformatics/tests/ncbi/test_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,11 @@ def test_match_table_attributes(self):

data = Table('brown-selected.tab')
data = Table.transpose(data, feature_names_column='gene')
gm.match_table_attributes(data)
data = gm.match_table_attributes(data, rename=True, source_name='FooBar')

for column in data.domain.attributes:
self.assertTrue(ENTREZ_ID in column.attributes)
self.assertTrue('FooBar' in column.attributes)


class TestGeneInfo(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ def set_progress():
state.set_status('Matching genes ...')
tax_id = species_name_to_taxid(species)
gm = GeneMatcher(tax_id)
gm.match_table_attributes(table)
table = gm.match_table_attributes(table, rename=True)
table.attributes[TableAnnotation.tax_id] = tax_id
table.attributes[TableAnnotation.gene_as_attr_name] = True
table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/bioinformatics/widgets/OWdictyExpress.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def send_to_output(self, result):
data = gene_matcher.match_table_column(data, 'Gene', StringVariable(ENTREZ_ID))
data.attributes[GENE_ID_COLUMN] = ENTREZ_ID
else:
gene_matcher.match_table_attributes(data)
data = gene_matcher.match_table_attributes(data)
data.attributes[GENE_ID_ATTRIBUTE] = ENTREZ_ID

# add table attributes
Expand Down