Merge pull request #181 from GavinHuttley/develop
MAINT: handle null dates on mysql import
GavinHuttley authored Jan 22, 2025
2 parents fa552e2 + 6929160 commit 327a03b
Showing 7 changed files with 51 additions and 13 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -61,6 +61,7 @@ test = [
"pandas",
"pytest",
"pytest-cov",
"pytest-timeout",
"pytest-xdist",
"ruff==0.9.1",
"nox"]
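The new pytest-timeout test dependency backs the @pytest.mark.timeout(...) markers added to the network-bound tests below. A minimal usage sketch, assuming nothing beyond the plugin being installed (the test name and body are hypothetical, not from this repository):

    import pytest

    @pytest.mark.timeout(10)  # fail the test if it runs longer than 10 seconds
    def test_remote_lookup():
        ...  # hypothetical code that talks to a remote server
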
51 changes: 41 additions & 10 deletions src/ensembl_tui/_ingest_annotation.py
@@ -70,16 +70,44 @@ def migrate_schema(con: duckdb.DuckDBPyConnection, table_name: str) -> None:
3 possible values (-1, 0, 1). So we explicitly set types for all columns
whose name ends in "strand".
"""
sql = f"CREATE TABLE {table_name} AS SELECT * FROM mysqldb.{table_name} LIMIT 0"
con.sql(sql)
# assume any column name ending with "strand" is a smallint (-1, 0, 1) for
# minus, unknown, plus strand
names = con.sql(f"DESCRIBE {table_name}").to_df()
for n in names["column_name"].to_list():
if n.endswith("strand"):
sql = f"""
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = '{table_name}'
AND column_name LIKE '%strand'"""
names = con.sql(sql).to_df()
names_types = zip(
names["column_name"].to_list(),
names["data_type"].to_list(),
strict=False,
)
for n, t in names_types:
if n.endswith("strand") and t == "BOOL":
# assume any column name ending with "strand" is a
# smallint (-1, 0, 1) for minus, unknown, plus strand
sql = f"ALTER TABLE {table_name} ALTER COLUMN {n} SET DATA TYPE TINYINT;"
con.sql(sql)

# change all timestamp columns to text
# this is required because duckdb import does not handle null timestamps
# like '0000-00-00 00:00:00'
sql = f"""
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = '{table_name}'
AND data_type = 'TIMESTAMP';
"""
names = con.sql(sql).to_df()
names_types = zip(
names["column_name"].to_list(),
names["data_type"].to_list(),
strict=False,
)
for n, t in names_types:
if t == "TIMESTAMP":
sql = f"ALTER TABLE {table_name} ALTER COLUMN {n} SET DATA TYPE TEXT;"
con.sql(sql)


def make_mysql_connection(
*,
@@ -128,12 +156,14 @@ def make_table_template(
db_user=db_user,
db_path=outname,
)
migrate_schema(con, table_name)
# we load the raw mysql schema from the ensembl mysql server
sql = f"CREATE TABLE {table_name} AS SELECT * FROM mysqldb.{table_name} LIMIT 0"
con.sql(sql)
con.close()
return outname


def import_mysql_table(
def import_mysqldump(
*,
con: duckdb.DuckDBPyConnection,
mysql_dump_path: pathlib.Path,
@@ -153,6 +183,7 @@ def import_mysql_table(
fix_start
if True, columns ending in "_start" are adjusted from 1-based to 0-based
"""
migrate_schema(con, table_name)
sql = f"INSERT INTO {table_name} SELECT * FROM read_csv_auto('{mysql_dump_path}', nullstr='\\N', header=false, ignore_errors=false, delim='\\t')"
con.sql(sql)
if fix_start:
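
The fix_start flag documented above shifts Ensembl's 1-based start coordinates to the 0-based convention used downstream; the adjustment itself is truncated in this view. A hypothetical sketch of that kind of shift (table and column names invented for illustration, not the project's code):

    import duckdb

    con = duckdb.connect()
    con.sql("CREATE TABLE demo_table (seq_region_start INTEGER, seq_region_end INTEGER)")
    con.sql("INSERT INTO demo_table VALUES (1, 10), (101, 200)")
    # subtract one from a *_start column to convert 1-based to 0-based
    con.sql("UPDATE demo_table SET seq_region_start = seq_region_start - 1")
    print(con.sql("SELECT * FROM demo_table").fetchall())  # [(0, 10), (100, 200)]
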
@@ -205,7 +236,7 @@ def write_parquet(
"""
dest_dir.mkdir(parents=True, exist_ok=True)
with tempdb(db_templates / f"{table_name}.duckdb") as con:
import_mysql_table(
import_mysqldump(
con=con,
mysql_dump_path=dump_path,
table_name=table_name,
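As the hunks above show, make_table_template now copies the raw table schema from the Ensembl MySQL server, and migrate_schema, now called from import_mysqldump, re-types two classes of columns before the dump is read: columns ending in "strand" that are reported as BOOL are widened to TINYINT so they can hold -1, 0 and 1, and TIMESTAMP columns become TEXT so MySQL's null-date placeholder '0000-00-00 00:00:00' does not abort the read_csv import. A self-contained sketch of that re-typing, assuming only that duckdb is installed (table and column names are invented, not the project's):

    import duckdb

    con = duckdb.connect()
    # stand-in for a table created from the Ensembl MySQL schema
    con.sql(
        "CREATE TABLE gene (stable_id VARCHAR, seq_region_strand BOOLEAN, created_date TIMESTAMP)"
    )
    # strand is ternary (-1, 0, 1), so BOOLEAN is too narrow
    con.sql("ALTER TABLE gene ALTER COLUMN seq_region_strand SET DATA TYPE TINYINT")
    # '0000-00-00 00:00:00' is not a valid TIMESTAMP, but loads fine as TEXT
    con.sql("ALTER TABLE gene ALTER COLUMN created_date SET DATA TYPE TEXT")

    con.sql("INSERT INTO gene VALUES ('ENSG0001', -1, '0000-00-00 00:00:00')")
    print(con.sql("SELECT * FROM gene").fetchall())  # [('ENSG0001', -1, '0000-00-00 00:00:00')]
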
1 change: 1 addition & 0 deletions tests/test_cli.py
@@ -13,6 +13,7 @@

@pytest.mark.slow
@pytest.mark.internet
@pytest.mark.timeout(120)
def test_download(tmp_config):
"""runs download, install, drop according to a special test cfg"""
tmp_dir = tmp_config.parent
2 changes: 2 additions & 0 deletions tests/test_config.py
@@ -135,6 +135,7 @@ def cfg_just_genomes(empty_cfg):


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_read_config_compara_genomes(cfg_just_aligns):
from ensembl_tui._species import Species

@@ -151,6 +152,7 @@ def test_read_config_compara_genomes(cfg_just_aligns):


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_read_config_genomes(cfg_just_genomes):
from ensembl_tui._species import Species

1 change: 1 addition & 0 deletions tests/test_download.py
@@ -6,6 +6,7 @@


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_get_db_names(tmp_config):
cfg = eti_config.read_config(tmp_config)
db_names = eti_download.get_core_db_dirnames(cfg)
4 changes: 2 additions & 2 deletions tests/test_install.py
@@ -121,8 +121,8 @@ def tsv_with_start_cols(tmp_path):
return tsv_path


def test_import_mysql_table(db_with_start_columns, tsv_with_start_cols):
eti_db_ingest.import_mysql_table(
def test_import_mysqldump(db_with_start_columns, tsv_with_start_cols):
eti_db_ingest.import_mysqldump(
con=db_with_start_columns,
mysql_dump_path=tsv_with_start_cols,
table_name="demo_table",
4 changes: 3 additions & 1 deletion tests/test_util.py
@@ -92,7 +92,7 @@ def test_valid_seq(name):
assert valid_seq_file(name)


@pytest.fixture(scope="function")
@pytest.fixture
def just_compara_cfg(tmp_config):
# no genomes!
parser = ConfigParser()
@@ -108,6 +108,7 @@


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_just_compara(just_compara_cfg):
# get species names from the alignment ref tree
cfg = eti_config.read_config(just_compara_cfg)
@@ -175,6 +176,7 @@ def test_config_update_species(tmp_config):


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_cfg_to_dict(just_compara_cfg):
cfg = eti_config.read_config(just_compara_cfg)
data = cfg.to_dict()
