Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alternative ways to get genomic positions #150

Merged
merged 21 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,15 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python }}
cache: "pip"
cache-dependency-path: "**/pyproject.toml"
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.r }}
use-public-rspm: true
- name: Install uv
uses: astral-sh/setup-uv@v5

- name: Install test dependencies
run: |
python -m pip install --upgrade pip wheel
- name: Install dependencies
run: |
pip install ".[dev,test,copykat]"
- name: Install the project
run: uv sync --extra dev --extra test --extra gtf --extra copykat

- name: Install R dependencies
run: |
Expand All @@ -73,9 +69,9 @@ jobs:
PLATFORM: ${{ matrix.os }}
DISPLAY: :42
run: |
coverage run -m pytest -v --color=yes
uv run coverage run -m pytest -v --color=yes
- name: Report coverage
run: |
coverage report
uv run coverage report
- name: Upload coverage
uses: codecov/codecov-action@v3
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
.DS_Store
*~
buck-out/
.pybiomart.sqlite

# Compiled files
.venv/
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![Documentation][badge-docs]][link-docs]
[![PyPI][badge-pypi]][link-pypi]

[badge-tests]: https://img.shields.io/github/actions/workflow/status/icbi-lab/infercnvpy/test.yaml?branch=main
[badge-tests]: https://github.com/icbi-lab/infercnvpy/actions/workflows/test.yaml/badge.svg
[link-tests]: https://github.com/icbi-lab/infercnvpy/actions/workflows/test.yaml
[badge-docs]: https://img.shields.io/readthedocs/infercnvpy
[badge-pypi]: https://img.shields.io/pypi/v/infercnvpy?logo=PyPI
Expand Down Expand Up @@ -80,5 +80,5 @@ n/a
[scverse-discourse]: https://discourse.scverse.org/
[issue-tracker]: /~https://github.com/icbi-lab/infercnvpy/issues
[changelog]: https://infercnvpy.readthedocs.io/latest/changelog.html
[link-docs]: https://infercnvpy.readthedocs.io
[link-docs]: https://infercnvpy.readthedocs.io/
[link-api]: https://infercnvpy.readthedocs.io/en/latest/api.html
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Input/Output: `io`
.. autosummary::
:toctree: ./generated

genomic_position_from_biomart
genomic_position_from_gtf
read_scevan

Expand Down
16 changes: 10 additions & 6 deletions docs/notebooks/tutorial_3k.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@
"cell_type": "raw",
"id": "respected-outreach",
"metadata": {
"raw_mimetype": "text/restructuredtext"
"raw_mimetype": "text/restructuredtext",
"vscode": {
"languageId": "raw"
}
},
"source": [
".. note::\n",
Expand All @@ -98,8 +101,9 @@
" the start and end positions on that chromosome for each gene, \n",
" respectively. \n",
" \n",
" Infercnvpy provides the :func:`infercnvpy.io.genomic_position_from_gtf` function\n",
" to read these information from a GTF file and add them to `adata.var`. \n",
" Infercnvpy provides the :func:`infercnvpy.io.genomic_position_from_biomart` and \n",
" :func:`infercnvpy.io.genomic_position_from_gtf` functions\n",
" to get this information online or from a GTF file and store it in `adata.var`. \n",
" \n",
"The example dataset is already appropriately preprocessed. "
]
Expand Down Expand Up @@ -1448,9 +1452,9 @@
"notebook_metadata_filter": "-kernelspec"
},
"kernelspec": {
"display_name": "Python [conda env:micromamba-infercnvpy]",
"display_name": ".venv",
"language": "python",
"name": "conda-env-micromamba-infercnvpy-py"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -1462,7 +1466,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.11.10"
}
},
"nbformat": 4,
Expand Down
12 changes: 10 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,24 @@ urls.Source = "/~https://github.com/icbi-lab/infercnvpy"
urls.Home-page = "/~https://github.com/icbi-lab/infercnvpy"
dependencies = [
'anndata>=0.7.3',
"scanpy>=1.9",
"scanpy>=1.10",
'pandas>=1',
'numpy>=1.20', # includes type annotations
'tqdm>=4.63.0', # fixes tqdm.auto
'pytoml',
'gtfparse>=2.1',
'pycairo>=1.20; sys_platform == "win32"',
'leidenalg',
'pyreadr',
'pytest-benchmark',
# for debug logging (referenced from the issue template)
"session-info",
"pybiomart>=0.2.0",
]

[project.optional-dependencies]
gtf = [
'gtfparse>=2.1'
]
copykat = [
'rpy2'
]
Expand All @@ -60,10 +63,12 @@ doc = [
'pycairo',
'jupyter_client',
"pandas",
"setuptools", # required for sphinxcontrib-bibtex
]
test = [
"pytest",
"coverage",
"openpyxl", # required for one of the scanpy datasets used in the tests
]

[tool.hatch.version]
Expand Down Expand Up @@ -155,3 +160,6 @@ skip = [
"docs/references.md",
"docs/notebooks/example.ipynb",
]

[tool.uv.sources]
gtfparse = { git = "/~https://github.com/lrauschning/gtfparse.git", rev = "dev" }
4 changes: 3 additions & 1 deletion src/infercnvpy/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from ._genepos import genomic_position_from_gtf
from ._genepos import genomic_position_from_biomart, genomic_position_from_gtf
from ._scevan import read_scevan

__all__ = ["genomic_position_from_gtf", "genomic_position_from_biomart", "read_scevan"]
100 changes: 98 additions & 2 deletions src/infercnvpy/io/_genepos.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,96 @@
from pathlib import Path
from typing import Literal

import gtfparse
import numpy as np
import pandas as pd
import scanpy.queries
from anndata import AnnData
from scanpy import logging


def genomic_position_from_biomart(
    adata: AnnData | None = None,
    *,
    adata_gene_id: str | None = None,
    biomart_gene_id="ensembl_gene_id",
    species: str = "hsapiens",
    inplace: bool = True,
    **kwargs,
) -> pd.DataFrame | None:
    """
    Get genomic gene positions from ENSEMBL Biomart.

    Parameters
    ----------
    adata
        Adds the genomic positions to `adata.var`. If adata is None, returns
        a data frame with the genomic positions instead.
    adata_gene_id
        Column in `adata.var` that contains (ENSEMBL) gene IDs. If not specified,
        use `adata.var_names`.
    biomart_gene_id
        The biomart column to use as gene identifier. Typically this would be `ensembl_gene_id` or `hgnc_symbol`,
        but could be different for other species.
    species
        Species identifier passed as first argument to :func:`scanpy.queries.biomart_annotations`.
    inplace
        If True, add the annotations directly to adata, otherwise return a dataframe.
        Ignored if `adata` is None.
    **kwargs
        passed on to :func:`scanpy.queries.biomart_annotations`

    Returns
    -------
    If `adata` is None, the Biomart annotation table with the columns
    `biomart_gene_id`, `start`, `end` and `chromosome` (no filtering applied).
    If `adata` is given and `inplace` is False, a copy of `adata.var` with the
    annotation columns merged in. Otherwise `None` (`adata.var` is replaced).
    """
    biomart_annot = (
        scanpy.queries.biomart_annotations(
            species,
            [
                biomart_gene_id,
                "start_position",
                "end_position",
                "chromosome_name",
            ],
            **kwargs,
        )
        .rename(
            columns={
                "start_position": "start",
                "end_position": "end",
                "chromosome_name": "chromosome",
            }
        )
        # use chr prefix for chromosome
        .assign(chromosome=lambda x: "chr" + x["chromosome"])
    )

    # Without an AnnData object there is nothing to merge into: hand back the
    # annotation table as documented instead of failing on `adata.var`.
    if adata is None:
        return biomart_annot

    gene_ids_adata = (adata.var_names if adata_gene_id is None else adata.var[adata_gene_id]).values
    missing_from_biomart = len(set(gene_ids_adata) - set(biomart_annot[biomart_gene_id].values))
    if missing_from_biomart:
        logging.warning(
            f"Biomart misses annotation for {missing_from_biomart} genes in adata. Did you use ENSEMBL ids?"
        )

    # Identifiers that occur more than once cannot be assigned a unique position.
    # All of their rows are removed (keep=False), so report the number of distinct
    # identifiers that are dropped rather than the number of surplus rows.
    is_ambiguous = biomart_annot[biomart_gene_id].duplicated(keep=False)
    n_skipped = biomart_annot.loc[is_ambiguous, biomart_gene_id].nunique()
    if n_skipped:
        logging.warning(f"Skipped {n_skipped} genes because of duplicate identifiers in Biomart.")
    biomart_annot = biomart_annot.loc[~is_ambiguous, :]

    # Move the index into a regular column so it survives the merge and can be
    # restored afterwards under its original name.
    tmp_var = adata.var.copy()
    orig_index_name = tmp_var.index.name
    TMP_INDEX_NAME = "adata_var_index"
    tmp_var.index.name = TMP_INDEX_NAME
    tmp_var.reset_index(inplace=True)
    var_annotated = tmp_var.merge(
        biomart_annot,
        how="left",
        left_on=TMP_INDEX_NAME if adata_gene_id is None else adata_gene_id,
        right_on=biomart_gene_id,
        # the merge itself checks that the gene identifiers are unique on both sides
        validate="one_to_one",
    )
    var_annotated.set_index(TMP_INDEX_NAME, inplace=True)
    var_annotated.index.name = orig_index_name

    if inplace:
        adata.var = var_annotated
        return None
    return var_annotated


def genomic_position_from_gtf(
gtf_file: Path | str,
adata: AnnData | None = None,
Expand All @@ -16,7 +99,8 @@ def genomic_position_from_gtf(
adata_gene_id: str | None = None,
inplace: bool = True,
) -> pd.DataFrame | None:
"""Get genomic gene positions from a GTF file.
"""
Get genomic gene positions from a GTF file.

The GTF file needs to match the genome annotation used for your single cell dataset.

Expand All @@ -38,6 +122,12 @@ def genomic_position_from_gtf(
inplace
If True, add the annotations directly to adata, otherwise return a dataframe.
"""
try:
import gtfparse
except ImportError:
raise ImportError(
"genomic_position_from_gtf requires gtfparse as optional dependency. Please install it using `pip install gtfparse`."
) from None
gtf = gtfparse.read_gtf(
gtf_file, usecols=["seqname", "feature", "start", "end", "gene_id", "gene_name"]
).to_pandas()
Expand All @@ -49,6 +139,8 @@ def genomic_position_from_gtf(
.drop_duplicates()
.rename(columns={"seqname": "chromosome"})
)
# remove ensembl versions
gtf["gene_id"] = gtf["gene_id"].str.replace(r"\.\d+$", "", regex=True)

gene_ids_adata = (adata.var_names if adata_gene_id is None else adata.var[adata_gene_id]).values
gtf = gtf.loc[gtf[gtf_gene_id].isin(gene_ids_adata), :]
Expand Down Expand Up @@ -77,6 +169,10 @@ def genomic_position_from_gtf(
var_annotated.set_index(TMP_INDEX_NAME, inplace=True)
var_annotated.index.name = orig_index_name

# if not a gencode GTF, let's add 'chr' prefix:
if np.all(~var_annotated["chromosome"].dropna().str.startswith("chr")):
var_annotated["chromosome"] = "chr" + var_annotated["chromosome"]

if inplace:
adata.var = var_annotated
else:
Expand Down
7 changes: 7 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import numpy as np
import pandas as pd
import pytest
Expand All @@ -7,6 +9,11 @@
import infercnvpy as cnv


@pytest.fixture()
def testdata():
    """Return the path of the ``data`` directory that sits next to this module."""
    tests_dir = Path(__file__).parent
    return tests_dir.joinpath("data")


@pytest.fixture(params=[np.array, sp.csr_matrix, sp.csc_matrix])
def adata_oligodendroma(request):
"""Adata with raw counts in .X parametrized to be either sparse or dense."""
Expand Down
Loading
Loading