Merge pull request #181 from GavinHuttley/develop
MAINT: handle null dates on mysql import
GavinHuttley authored Jan 22, 2025
2 parents fa552e2 + 6929160 commit 327a03b
Showing 7 changed files with 51 additions and 13 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -61,6 +61,7 @@ test = [
"pandas",
"pytest",
"pytest-cov",
"pytest-timeout",
"pytest-xdist",
"ruff==0.9.1",
"nox"]
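The new pytest-timeout test dependency backs the @pytest.mark.timeout(...) markers added to the network-bound tests below. A minimal usage sketch, assuming nothing beyond the plugin being installed (the test name and body are hypothetical, not from this repository):

    import pytest

    @pytest.mark.timeout(10)  # fail the test if it runs longer than 10 seconds
    def test_remote_lookup():
        ...  # hypothetical code that talks to a remote server
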
51 changes: 41 additions & 10 deletions src/ensembl_tui/_ingest_annotation.py
@@ -70,16 +70,44 @@ def migrate_schema(con: duckdb.DuckDBPyConnection, table_name: str) -> None:
3 possible values (-1, 0, 1). So we explicitly set types for all columns
whose name ends in "strand".
"""
sql = f"CREATE TABLE {table_name} AS SELECT * FROM mysqldb.{table_name} LIMIT 0"
con.sql(sql)
# assume any column name ending with "strand" is a smallint (-1, 0, 1) for
# minus, unknown, plus strand
names = con.sql(f"DESCRIBE {table_name}").to_df()
for n in names["column_name"].to_list():
if n.endswith("strand"):
sql = f"""
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = '{table_name}'
AND column_name LIKE '%strand'"""
names = con.sql(sql).to_df()
names_types = zip(
names["column_name"].to_list(),
names["data_type"].to_list(),
strict=False,
)
for n, t in names_types:
if n.endswith("strand") and t == "BOOL":
# assume any column name ending with "strand" is a
# smallint (-1, 0, 1) for minus, unknown, plus strand
sql = f"ALTER TABLE {table_name} ALTER COLUMN {n} SET DATA TYPE TINYINT;"
con.sql(sql)

# change all timestamp columns to text
# this is required because duckdb import does not handle null timestamps
# like '0000-00-00 00:00:00'
sql = f"""
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = '{table_name}'
AND data_type = 'TIMESTAMP';
"""
names = con.sql(sql).to_df()
names_types = zip(
names["column_name"].to_list(),
names["data_type"].to_list(),
strict=False,
)
for n, t in names_types:
if t == "TIMESTAMP":
sql = f"ALTER TABLE {table_name} ALTER COLUMN {n} SET DATA TYPE TEXT;"
con.sql(sql)


def make_mysql_connection(
*,
@@ -128,12 +156,14 @@ def make_table_template(
db_user=db_user,
db_path=outname,
)
migrate_schema(con, table_name)
# we load the raw mysql schema from the ensembl mysql server
sql = f"CREATE TABLE {table_name} AS SELECT * FROM mysqldb.{table_name} LIMIT 0"
con.sql(sql)
con.close()
return outname


def import_mysql_table(
def import_mysqldump(
*,
con: duckdb.DuckDBPyConnection,
mysql_dump_path: pathlib.Path,
@@ -153,6 +183,7 @@ def import_mysql_table(
fix_start
if True, columns ending in "_start" are adjusted from 1-based to 0-based
"""
migrate_schema(con, table_name)
sql = f"INSERT INTO {table_name} SELECT * FROM read_csv_auto('{mysql_dump_path}', nullstr='\\N', header=false, ignore_errors=false, delim='\\t')"
con.sql(sql)
if fix_start:
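
The fix_start flag documented above shifts Ensembl's 1-based start coordinates to the 0-based convention used downstream; the adjustment itself is truncated in this view. A hypothetical sketch of that kind of shift (table and column names invented for illustration, not the project's code):

    import duckdb

    con = duckdb.connect()
    con.sql("CREATE TABLE demo_table (seq_region_start INTEGER, seq_region_end INTEGER)")
    con.sql("INSERT INTO demo_table VALUES (1, 10), (101, 200)")
    # subtract one from a *_start column to convert 1-based to 0-based
    con.sql("UPDATE demo_table SET seq_region_start = seq_region_start - 1")
    print(con.sql("SELECT * FROM demo_table").fetchall())  # [(0, 10), (100, 200)]
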
@@ -205,7 +236,7 @@ def write_parquet(
"""
dest_dir.mkdir(parents=True, exist_ok=True)
with tempdb(db_templates / f"{table_name}.duckdb") as con:
import_mysql_table(
import_mysqldump(
con=con,
mysql_dump_path=dump_path,
table_name=table_name,
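As the hunks above show, make_table_template now copies the raw table schema from the Ensembl MySQL server, and migrate_schema, now called from import_mysqldump, re-types two classes of columns before the dump is read: columns ending in "strand" that are reported as BOOL are widened to TINYINT so they can hold -1, 0 and 1, and TIMESTAMP columns become TEXT so MySQL's null-date placeholder '0000-00-00 00:00:00' does not abort the read_csv import. A self-contained sketch of that re-typing, assuming only that duckdb is installed (table and column names are invented, not the project's):

    import duckdb

    con = duckdb.connect()
    # stand-in for a table created from the Ensembl MySQL schema
    con.sql(
        "CREATE TABLE gene (stable_id VARCHAR, seq_region_strand BOOLEAN, created_date TIMESTAMP)"
    )
    # strand is ternary (-1, 0, 1), so BOOLEAN is too narrow
    con.sql("ALTER TABLE gene ALTER COLUMN seq_region_strand SET DATA TYPE TINYINT")
    # '0000-00-00 00:00:00' is not a valid TIMESTAMP, but loads fine as TEXT
    con.sql("ALTER TABLE gene ALTER COLUMN created_date SET DATA TYPE TEXT")

    con.sql("INSERT INTO gene VALUES ('ENSG0001', -1, '0000-00-00 00:00:00')")
    print(con.sql("SELECT * FROM gene").fetchall())  # [('ENSG0001', -1, '0000-00-00 00:00:00')]
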
1 change: 1 addition & 0 deletions tests/test_cli.py
@@ -13,6 +13,7 @@

@pytest.mark.slow
@pytest.mark.internet
@pytest.mark.timeout(120)
def test_download(tmp_config):
"""runs download, install, drop according to a special test cfg"""
tmp_dir = tmp_config.parent
2 changes: 2 additions & 0 deletions tests/test_config.py
@@ -135,6 +135,7 @@ def cfg_just_genomes(empty_cfg):


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_read_config_compara_genomes(cfg_just_aligns):
from ensembl_tui._species import Species

@@ -151,6 +152,7 @@ def test_read_config_compara_genomes(cfg_just_aligns):


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_read_config_genomes(cfg_just_genomes):
from ensembl_tui._species import Species

1 change: 1 addition & 0 deletions tests/test_download.py
@@ -6,6 +6,7 @@


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_get_db_names(tmp_config):
cfg = eti_config.read_config(tmp_config)
db_names = eti_download.get_core_db_dirnames(cfg)
4 changes: 2 additions & 2 deletions tests/test_install.py
@@ -121,8 +121,8 @@ def tsv_with_start_cols(tmp_path):
return tsv_path


def test_import_mysql_table(db_with_start_columns, tsv_with_start_cols):
eti_db_ingest.import_mysql_table(
def test_import_mysqldump(db_with_start_columns, tsv_with_start_cols):
eti_db_ingest.import_mysqldump(
con=db_with_start_columns,
mysql_dump_path=tsv_with_start_cols,
table_name="demo_table",
4 changes: 3 additions & 1 deletion tests/test_util.py
@@ -92,7 +92,7 @@ def test_valid_seq(name):
assert valid_seq_file(name)


@pytest.fixture(scope="function")
@pytest.fixture
def just_compara_cfg(tmp_config):
# no genomes!
parser = ConfigParser()
@@ -108,6 +108,7 @@


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_just_compara(just_compara_cfg):
# get species names from the alignment ref tree
cfg = eti_config.read_config(just_compara_cfg)
@@ -175,6 +176,7 @@ def test_config_update_species(tmp_config):


@pytest.mark.internet
@pytest.mark.timeout(10)
def test_cfg_to_dict(just_compara_cfg):
cfg = eti_config.read_config(just_compara_cfg)
data = cfg.to_dict()
