Skip to content

Commit

Permalink
v.db.univar: Add JSON output (OSGeo#2386)
Browse files Browse the repository at this point in the history
* Add JSON output to db.univar.
* Add JSON from db.univar to v.db.univar.
* All formats now handled through the format option.
* Output percentiles as two lists, not a mapping.

Tests:

* Old tests fixed.
* New tests are using pytest.
* Fixed values for test obtained from the plain output, so the new JSON output is checked to fit with the original.
* The tests that compute expected values with NumPy would fail with different test data; i.e., using an n other than 10 makes the tests fail.
  • Loading branch information
wenzeslaus authored and ninsbl committed Feb 17, 2023
1 parent 7dc3295 commit e3d1676
Show file tree
Hide file tree
Showing 7 changed files with 384 additions and 34 deletions.
94 changes: 86 additions & 8 deletions scripts/db.univar/db.univar.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@
# % options: 0-100
# % multiple: yes
# %end
# %option
# % key: format
# % type: string
# % multiple: no
# % options: plain,json,shell
# % label: Output format
# % descriptions: plain;Plain text output;json;JSON (JavaScript Object Notation);shell;Shell script style for Bash eval
# %end
# %flag
# % key: e
# % description: Extended statistics (quartiles and 90th percentile)
Expand All @@ -57,6 +65,7 @@

import sys
import atexit
import json
import math

import grass.script as gscript
Expand Down Expand Up @@ -88,6 +97,11 @@ def sortfile(infile, outfile):


def main():
# A more substantial rewrite of the code is needed, possibly to C or
# using Python packages such as statistics or NumPy,
# so ignoring the duplication of final computation of some statistics
# as well as pushing the limit of how long the function can be.
# pylint: disable=too-many-branches
global tmp
tmp = gscript.tempfile()

Expand All @@ -99,9 +113,20 @@ def main():
driver = options["driver"]
where = options["where"]
perc = options["percentile"]
output_format = options["format"]

perc = [float(p) for p in perc.split(",")]

if not output_format:
if shellstyle:
output_format = "shell"
else:
output_format = "plain"
elif shellstyle:
# This can be a message or warning in future versions.
# In version 9, -g may be removed.
gscript.verbose(_("The format option is used and -g flag ignored"))

desc_table = gscript.db_describe(table, database=database, driver=driver)
if not desc_table:
gscript.fatal(_("Unable to describe table <%s>") % table)
Expand All @@ -114,7 +139,7 @@ def main():
if not found:
gscript.fatal(_("Column <%s> not found in table <%s>") % (column, table))

if not shellstyle:
if output_format == "plain":
gscript.verbose(
_("Calculation for column <%s> of table <%s>...") % (column, table)
)
Expand Down Expand Up @@ -145,11 +170,12 @@ def main():
# check if result is empty
tmpf = open(tmp)
if tmpf.read(1) == "":
gscript.fatal(_("Table <%s> contains no data.") % table)
if output_format in ["plain", "shell"]:
gscript.fatal(_("Table <%s> contains no data.") % table)
tmpf.close()

# calculate statistics
if not shellstyle:
if output_format == "plain":
gscript.verbose(_("Calculating statistics..."))

N = 0
Expand All @@ -174,9 +200,27 @@ def main():
tmpf.close()

if N <= 0:
gscript.fatal(_("No non-null values found"))

if not shellstyle:
if output_format in ["plain", "shell"]:
gscript.fatal(_("No non-null values found"))
else:
# We produce valid JSON with a value for n even when the query returned
# no rows or when all values are nulls.
result = {}
result["n"] = N
nan_value = None
result["min"] = nan_value
result["max"] = nan_value
result["range"] = nan_value
result["mean"] = nan_value
result["mean_abs"] = nan_value
result["variance"] = nan_value
result["stddev"] = nan_value
result["coeff_var"] = nan_value
result["sum"] = nan_value
json.dump({"statistics": result}, sys.stdout)
return

if output_format == "plain":
sys.stdout.write("Number of values: %d\n" % N)
sys.stdout.write("Minimum: %.15g\n" % minv)
sys.stdout.write("Maximum: %.15g\n" % maxv)
Expand All @@ -197,7 +241,28 @@ def main():
sys.stdout.write("Standard deviation: 0\n")
sys.stdout.write("Coefficient of variation: 0\n")
sys.stdout.write("Sum: %.15g\n" % sum)
else:
elif output_format == "json":
result = {}
result["n"] = N
result["min"] = minv
result["max"] = maxv
result["range"] = maxv - minv
result["mean"] = sum / N
result["mean_abs"] = sum3 / N
if not ((sum2 - sum * sum / N) / N) < 0:
result["variance"] = (sum2 - sum * sum / N) / N
result["stddev"] = math.sqrt((sum2 - sum * sum / N) / N)
result["coeff_var"] = (math.sqrt((sum2 - sum * sum / N) / N)) / (
math.sqrt(sum * sum) / N
)
else:
result["variance"] = 0
result["stddev"] = 0
result["coeff_var"] = 0
result["sum"] = sum
if not extend:
json.dump({"statistics": result}, sys.stdout)
elif output_format == "shell":
sys.stdout.write("n=%d\n" % N)
sys.stdout.write("min=%.15g\n" % minv)
sys.stdout.write("max=%.15g\n" % maxv)
Expand All @@ -216,6 +281,8 @@ def main():
sys.stdout.write("stddev=0\n")
sys.stdout.write("coeff_var=0\n")
sys.stdout.write("sum=%.15g\n" % sum)
else:
raise ValueError(f"Unknown output format {output_format}")

if not extend:
return
Expand Down Expand Up @@ -266,7 +333,7 @@ def main():

q50 = (q50a + q50b) / 2

if not shellstyle:
if output_format == "plain":
sys.stdout.write("1st Quartile: %.15g\n" % q25)
sys.stdout.write("Median (%s N): %.15g\n" % (eostr, q50))
sys.stdout.write("3rd Quartile: %.15g\n" % q75)
Expand All @@ -290,6 +357,17 @@ def main():
)
else:
sys.stdout.write("%.15g Percentile: %.15g\n" % (perc[i], pval[i]))
elif output_format == "json":
result["first_quartile"] = q25
result["median"] = q50
result["third_quartile"] = q75
if options["percentile"]:
percentile_values = []
for i in range(len(perc)):
percentile_values.append(pval[i])
result["percentiles"] = perc
result["percentile_values"] = percentile_values
json.dump({"statistics": result}, sys.stdout)
else:
sys.stdout.write("first_quartile=%.15g\n" % q25)
sys.stdout.write("median=%.15g\n" % q50)
Expand Down
8 changes: 4 additions & 4 deletions scripts/db.univar/testsuite/test_db_univar.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class TestDbUnivar(TestCase):
map_name = "samples"

@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
def setUpClass(cls):
"""Generate vector points in extend larger than raster with values"""
cls.use_temp_region()
cls.runModule("g.region", raster="elevation")
Expand All @@ -26,17 +26,17 @@ def setUpClass(cls): # pylint: disable=invalid-name
cls.runModule(
"v.db.addtable",
map=cls.map_name,
columns="{} double precision".format(cls.column_name),
columns=f"{cls.column_name} double precision",
)
cls.runModule(
"v.what.rast", map=cls.map_name, raster="elevation", column=cls.column_name
)

@classmethod
def tearDownClass(cls): # pylint: disable=invalid-name
def tearDownClass(cls):
"""Remove temporary region and vector"""
cls.del_temp_region()
cls.runModule("g.remove", flags="f", type="vector", name=cls.map_name)
cls.del_temp_region()

def test_calculate(self):
"""Check that db.univar runs"""
Expand Down
69 changes: 69 additions & 0 deletions scripts/v.db.univar/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Fixtures for v.db.univar tests"""

from types import SimpleNamespace

import pytest

import grass.script as gs
import grass.script.setup as grass_setup


def updates_as_transaction(table, cat_column, column, cats, values):
    """Create SQL statement for categories and values for a given column

    Pairs up *cats* and *values* and wraps one UPDATE per pair in a single
    BEGIN/END TRANSACTION block, returned as one newline-joined string.
    """
    updates = [
        f"UPDATE {table} SET {column} = {value} WHERE {cat_column} = {cat};"
        for cat, value in zip(cats, values)
    ]
    return "\n".join(["BEGIN TRANSACTION", *updates, "END TRANSACTION"])


def value_update_by_category(map_name, layer, column_name, cats, values):
    """Update column value for multiple rows based on category

    Looks up the attribute table connection for *map_name* at *layer* and
    runs one transaction updating *column_name* for each (cat, value) pair.
    """
    connection = gs.vector_db(map_name)[layer]
    statement = updates_as_transaction(
        table=connection["table"],
        cat_column="cat",
        column=column_name,
        cats=cats,
        values=values,
    )
    gs.write_command(
        "db.execute",
        input="-",
        database=connection["database"],
        driver=connection["driver"],
        stdin=statement,
    )


@pytest.fixture(scope="module")
def simple_dataset(tmp_path_factory):
    """Creates a session with a mapset which has vector with a float column

    Module-scoped: the GRASS location and data are created once and shared
    by all tests in the module. Yields a SimpleNamespace with the vector
    name, the column name, and the list of float values stored in it
    (one value per point, matched to categories 1..num_points).
    """
    tmp_path = tmp_path_factory.mktemp("simple_dataset")
    location = "test"
    map_name = "points"
    column_name = "double_value"
    # NOTE(review): tests were written against statistics of exactly these
    # 10 values; using a different num_points breaks the expected results.
    num_points = 10
    # There appears to be no public helper to create an XY location here,
    # hence the private function call.
    gs.core._create_location_xy(tmp_path, location)  # pylint: disable=protected-access
    with grass_setup.init(tmp_path / location):
        gs.run_command("g.region", s=0, n=80, w=0, e=120, b=0, t=50, res=10, res3=10)
        # Fixed seed makes point placement reproducible across runs.
        gs.run_command("v.random", output=map_name, npoints=num_points, seed=42)
        gs.run_command(
            "v.db.addtable",
            map=map_name,
            columns=f"{column_name} double precision",
        )
        # Distinct, easily recognizable values: 100.11, 101.11, ..., 109.11,
        # assigned to categories 1..num_points in order.
        cats = list(range(1, 1 + num_points))
        values = [float(i) + 0.11 for i in range(100, 100 + num_points)]
        value_update_by_category(
            map_name=map_name,
            layer=1,
            column_name=column_name,
            cats=cats,
            values=values,
        )
        # Yield inside the session context so tests run with the mapset active.
        yield SimpleNamespace(
            vector_name=map_name,
            column_name=column_name,
            values=values,
        )
Loading

0 comments on commit e3d1676

Please sign in to comment.