Skip to content

Commit

Permalink
v.db.univar: Add JSON output (OSGeo#2386)
Browse files Browse the repository at this point in the history
* Add JSON output to db.univar.
* Add JSON from db.univar to v.db.univar.
* All formats now handled through the format option.
* Output percentiles as two lists, not a mapping.

Tests:

* Old tests fixed.
* New tests are using pytest.
* Fixed values for test obtained from the plain output, so the new JSON output is checked to fit with the original.
* The tests that compute expected values with NumPy would fail with different test data; i.e., using an n other than 10 makes the tests fail.
  • Loading branch information
wenzeslaus authored and ninsbl committed Feb 17, 2023
1 parent 7dc3295 commit e3d1676
Show file tree
Hide file tree
Showing 7 changed files with 384 additions and 34 deletions.
94 changes: 86 additions & 8 deletions scripts/db.univar/db.univar.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@
# % options: 0-100
# % multiple: yes
# %end
# %option
# % key: format
# % type: string
# % multiple: no
# % options: plain,json,shell
# % label: Output format
# % descriptions: plain;Plain text output;json;JSON (JavaScript Object Notation);shell;Shell script style for Bash eval
# %end
# %flag
# % key: e
# % description: Extended statistics (quartiles and 90th percentile)
Expand All @@ -57,6 +65,7 @@

import sys
import atexit
import json
import math

import grass.script as gscript
Expand Down Expand Up @@ -88,6 +97,11 @@ def sortfile(infile, outfile):


def main():
# A more substantial rewrite of the code is needed, possibly to C or
# using Python packages such as statistics or NumPy,
# so ignoring the duplication of final computation of some statistics
# as well as pushing the limit of how long the function can be.
# pylint: disable=too-many-branches
global tmp
tmp = gscript.tempfile()

Expand All @@ -99,9 +113,20 @@ def main():
driver = options["driver"]
where = options["where"]
perc = options["percentile"]
output_format = options["format"]

perc = [float(p) for p in perc.split(",")]

if not output_format:
if shellstyle:
output_format = "shell"
else:
output_format = "plain"
elif shellstyle:
# This can be a message or warning in future versions.
# In version 9, -g may be removed.
gscript.verbose(_("The format option is used and -g flag ignored"))

desc_table = gscript.db_describe(table, database=database, driver=driver)
if not desc_table:
gscript.fatal(_("Unable to describe table <%s>") % table)
Expand All @@ -114,7 +139,7 @@ def main():
if not found:
gscript.fatal(_("Column <%s> not found in table <%s>") % (column, table))

if not shellstyle:
if output_format == "plain":
gscript.verbose(
_("Calculation for column <%s> of table <%s>...") % (column, table)
)
Expand Down Expand Up @@ -145,11 +170,12 @@ def main():
# check if result is empty
tmpf = open(tmp)
if tmpf.read(1) == "":
gscript.fatal(_("Table <%s> contains no data.") % table)
if output_format in ["plain", "shell"]:
gscript.fatal(_("Table <%s> contains no data.") % table)
tmpf.close()

# calculate statistics
if not shellstyle:
if output_format == "plain":
gscript.verbose(_("Calculating statistics..."))

N = 0
Expand All @@ -174,9 +200,27 @@ def main():
tmpf.close()

if N <= 0:
gscript.fatal(_("No non-null values found"))

if not shellstyle:
if output_format in ["plain", "shell"]:
gscript.fatal(_("No non-null values found"))
else:
# We produce valid JSON with a value for n even when the query returned
# no rows or when all values are nulls.
result = {}
result["n"] = N
nan_value = None
result["min"] = nan_value
result["max"] = nan_value
result["range"] = nan_value
result["mean"] = nan_value
result["mean_abs"] = nan_value
result["variance"] = nan_value
result["stddev"] = nan_value
result["coeff_var"] = nan_value
result["sum"] = nan_value
json.dump({"statistics": result}, sys.stdout)
return

if output_format == "plain":
sys.stdout.write("Number of values: %d\n" % N)
sys.stdout.write("Minimum: %.15g\n" % minv)
sys.stdout.write("Maximum: %.15g\n" % maxv)
Expand All @@ -197,7 +241,28 @@ def main():
sys.stdout.write("Standard deviation: 0\n")
sys.stdout.write("Coefficient of variation: 0\n")
sys.stdout.write("Sum: %.15g\n" % sum)
else:
elif output_format == "json":
result = {}
result["n"] = N
result["min"] = minv
result["max"] = maxv
result["range"] = maxv - minv
result["mean"] = sum / N
result["mean_abs"] = sum3 / N
if not ((sum2 - sum * sum / N) / N) < 0:
result["variance"] = (sum2 - sum * sum / N) / N
result["stddev"] = math.sqrt((sum2 - sum * sum / N) / N)
result["coeff_var"] = (math.sqrt((sum2 - sum * sum / N) / N)) / (
math.sqrt(sum * sum) / N
)
else:
result["variance"] = 0
result["stddev"] = 0
result["coeff_var"] = 0
result["sum"] = sum
if not extend:
json.dump({"statistics": result}, sys.stdout)
elif output_format == "shell":
sys.stdout.write("n=%d\n" % N)
sys.stdout.write("min=%.15g\n" % minv)
sys.stdout.write("max=%.15g\n" % maxv)
Expand All @@ -216,6 +281,8 @@ def main():
sys.stdout.write("stddev=0\n")
sys.stdout.write("coeff_var=0\n")
sys.stdout.write("sum=%.15g\n" % sum)
else:
raise ValueError(f"Unknown output format {output_format}")

if not extend:
return
Expand Down Expand Up @@ -266,7 +333,7 @@ def main():

q50 = (q50a + q50b) / 2

if not shellstyle:
if output_format == "plain":
sys.stdout.write("1st Quartile: %.15g\n" % q25)
sys.stdout.write("Median (%s N): %.15g\n" % (eostr, q50))
sys.stdout.write("3rd Quartile: %.15g\n" % q75)
Expand All @@ -290,6 +357,17 @@ def main():
)
else:
sys.stdout.write("%.15g Percentile: %.15g\n" % (perc[i], pval[i]))
elif output_format == "json":
result["first_quartile"] = q25
result["median"] = q50
result["third_quartile"] = q75
if options["percentile"]:
percentile_values = []
for i in range(len(perc)):
percentile_values.append(pval[i])
result["percentiles"] = perc
result["percentile_values"] = percentile_values
json.dump({"statistics": result}, sys.stdout)
else:
sys.stdout.write("first_quartile=%.15g\n" % q25)
sys.stdout.write("median=%.15g\n" % q50)
Expand Down
8 changes: 4 additions & 4 deletions scripts/db.univar/testsuite/test_db_univar.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class TestDbUnivar(TestCase):
map_name = "samples"

@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
def setUpClass(cls):
"""Generate vector points in extend larger than raster with values"""
cls.use_temp_region()
cls.runModule("g.region", raster="elevation")
Expand All @@ -26,17 +26,17 @@ def setUpClass(cls): # pylint: disable=invalid-name
cls.runModule(
"v.db.addtable",
map=cls.map_name,
columns="{} double precision".format(cls.column_name),
columns=f"{cls.column_name} double precision",
)
cls.runModule(
"v.what.rast", map=cls.map_name, raster="elevation", column=cls.column_name
)

@classmethod
def tearDownClass(cls): # pylint: disable=invalid-name
def tearDownClass(cls):
"""Remove temporary region and vector"""
cls.del_temp_region()
cls.runModule("g.remove", flags="f", type="vector", name=cls.map_name)
cls.del_temp_region()

def test_calculate(self):
"""Check that db.univar runs"""
Expand Down
69 changes: 69 additions & 0 deletions scripts/v.db.univar/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Fixtures for v.db.univar tests"""

from types import SimpleNamespace

import pytest

import grass.script as gs
import grass.script.setup as grass_setup


def updates_as_transaction(table, cat_column, column, cats, values):
    """Create SQL statement for categories and values for a given column

    Pairs up *cats* and *values* and wraps one UPDATE per pair in a single
    BEGIN/END TRANSACTION block, returned as one newline-joined string.
    """
    updates = [
        f"UPDATE {table} SET {column} = {value} WHERE {cat_column} = {cat};"
        for cat, value in zip(cats, values)
    ]
    return "\n".join(["BEGIN TRANSACTION", *updates, "END TRANSACTION"])


def value_update_by_category(map_name, layer, column_name, cats, values):
    """Update column value for multiple rows based on category

    Looks up the attribute table connection for *map_name* at *layer* and
    runs one transaction updating *column_name* for each (cat, value) pair.
    """
    connection = gs.vector_db(map_name)[layer]
    statement = updates_as_transaction(
        table=connection["table"],
        cat_column="cat",
        column=column_name,
        cats=cats,
        values=values,
    )
    gs.write_command(
        "db.execute",
        input="-",
        database=connection["database"],
        driver=connection["driver"],
        stdin=statement,
    )


@pytest.fixture(scope="module")
def simple_dataset(tmp_path_factory):
    """Creates a session with a mapset which has vector with a float column

    Module-scoped: the GRASS location and data are created once and shared
    by all tests in the module. Yields a SimpleNamespace with the vector
    name, the column name, and the list of float values stored in it
    (one value per point, matched to categories 1..num_points).
    """
    tmp_path = tmp_path_factory.mktemp("simple_dataset")
    location = "test"
    map_name = "points"
    column_name = "double_value"
    # NOTE(review): tests were written against statistics of exactly these
    # 10 values; using a different num_points breaks the expected results.
    num_points = 10
    # There appears to be no public helper to create an XY location here,
    # hence the private function call.
    gs.core._create_location_xy(tmp_path, location)  # pylint: disable=protected-access
    with grass_setup.init(tmp_path / location):
        gs.run_command("g.region", s=0, n=80, w=0, e=120, b=0, t=50, res=10, res3=10)
        # Fixed seed makes point placement reproducible across runs.
        gs.run_command("v.random", output=map_name, npoints=num_points, seed=42)
        gs.run_command(
            "v.db.addtable",
            map=map_name,
            columns=f"{column_name} double precision",
        )
        # Distinct, easily recognizable values: 100.11, 101.11, ..., 109.11,
        # assigned to categories 1..num_points in order.
        cats = list(range(1, 1 + num_points))
        values = [float(i) + 0.11 for i in range(100, 100 + num_points)]
        value_update_by_category(
            map_name=map_name,
            layer=1,
            column_name=column_name,
            cats=cats,
            values=values,
        )
        # Yield inside the session context so tests run with the mapset active.
        yield SimpleNamespace(
            vector_name=map_name,
            column_name=column_name,
            values=values,
        )
Loading

0 comments on commit e3d1676

Please sign in to comment.