diff --git a/.ci_helpers/run-test.py b/.ci_helpers/run-test.py index bd8c0435b..87dd82904 100644 --- a/.ci_helpers/run-test.py +++ b/.ci_helpers/run-test.py @@ -118,15 +118,24 @@ for k, v in test_to_run.items(): print(f"=== RUNNING {k.upper()} TESTS===") print(f"Touched files: {','.join([os.path.basename(p) for p in v])}") - if k == "root": - file_glob_str = "echopype/tests/test_*.py" - cov_mod_arg = ["--cov=echopype"] + # Run specific test files + # The input files must all starts with "test" in their name + # otherwise module globbing for specific test files will + # be used + if all(f.name.startswith("test") for f in v): + # For specific test files + test_files = [str(p) for p in v] else: - file_glob_str = f"echopype/tests/{k}/*.py" - cov_mod_arg = [f"--cov=echopype/{k}"] - if args.include_cov: - pytest_args = original_pytest_args + cov_mod_arg - test_files = glob.glob(file_glob_str) + # Run all tests in a module + if k == "root": + file_glob_str = "echopype/tests/test_*.py" + cov_mod_arg = ["--cov=echopype"] + else: + file_glob_str = f"echopype/tests/{k}/*.py" + cov_mod_arg = [f"--cov=echopype/{k}"] + if args.include_cov: + pytest_args = original_pytest_args + cov_mod_arg + test_files = glob.glob(file_glob_str) final_args = pytest_args + test_files print(f"Pytest args: {final_args}") exit_code = pytest.main(final_args) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 13c398aca..69b0659a6 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -9,25 +9,20 @@ on: workflow_dispatch: env: - CONDA_ENV: echopype NUM_WORKERS: 2 jobs: test: name: ${{ matrix.python-version }}-build - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest if: ${{ !contains(github.event.head_commit.message, '[skip ci]') }} continue-on-error: ${{ matrix.experimental }} strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10"] # TODO: add back 3.11 once parsed2zarr is fixed + python-version: ["3.9", "3.10", "3.11"] runs-on: [ubuntu-latest] experimental: [false] - include: - - runs-on: ubuntu-latest - python-version: "3.11" - experimental: true services: # TODO: figure out how to update tag when there's a new one minio: @@ -46,30 +41,20 @@ jobs: - name: Set environment variables run: | echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV - - name: Setup micromamba - uses: mamba-org/setup-micromamba@v1 + - name: Set up Python + uses: actions/setup-python@v4.7.1 with: - environment-file: .ci_helpers/py${{ matrix.python-version }}.yaml - environment-name: ${{ env.CONDA_ENV }} - cache-environment: true - post-cleanup: 'all' - - name: Print conda environment - shell: bash -l {0} - run: | - micromamba info - micromamba list + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: python -m pip install --upgrade pip - name: Remove docker-compose python - if: ${{ matrix.python-version == '3.10' || matrix.python-version == '3.11' }} - shell: bash -l {0} run: sed -i "/docker-compose/d" requirements-dev.txt - name: Install dev tools - shell: bash -l {0} - run: | - micromamba install -c conda-forge -n ${{ env.CONDA_ENV }} --yes --file requirements-dev.txt + run: python -m pip install -r requirements-dev.txt - name: Install echopype - shell: bash -l {0} - run: | - python -m pip install -e .[plot] + run: python -m pip install -e ".[plot]" + - name: Print installed packages + run: python -m pip list - name: Copying test data to services shell: bash -l {0} run: | diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 
800a1e96f..b44acb8b7 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -5,7 +5,6 @@ on: paths-ignore: ["**/docker.yaml", "docs"] env: - CONDA_ENV: echopype NUM_WORKERS: 2 jobs: @@ -17,16 +16,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10"] # TODO: add back 3.11 once parsed2zarr is fixed + python-version: ["3.9", "3.10", "3.11"] runs-on: [ubuntu-latest] experimental: [false] - include: - - runs-on: ubuntu-latest - python-version: "3.11" - experimental: true - defaults: - run: - shell: bash -l {0} services: # TODO: figure out how to update tag when there's a new one minio: @@ -42,35 +34,28 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history for all branches and tags. + - name: Set up Python + uses: actions/setup-python@v4.7.1 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: python -m pip install --upgrade pip - name: Set environment variables run: | echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV - - name: Setup micromamba - uses: mamba-org/setup-micromamba@v1 - with: - environment-file: .ci_helpers/py${{ matrix.python-version }}.yaml - environment-name: ${{ env.CONDA_ENV }} - cache-environment: true - post-cleanup: 'all' - - name: Print conda environment - run: | - micromamba info - micromamba list - name: Remove docker-compose python - if: ${{ matrix.python-version == '3.10' || matrix.python-version == '3.11' }} run: sed -i "/docker-compose/d" requirements-dev.txt - name: Install dev tools - run: | - micromamba install -c conda-forge -n ${{ env.CONDA_ENV }} --yes --file requirements-dev.txt + run: python -m pip install -r requirements-dev.txt # We only want to install this on one run, because otherwise we'll have # duplicate annotations. - name: Install error reporter if: ${{ matrix.python-version == '3.9' }} - run: | - python -m pip install pytest-github-actions-annotate-failures + run: python -m pip install pytest-github-actions-annotate-failures - name: Install echopype - run: | - python -m pip install -e .[plot] + run: python -m pip install -e ".[plot]" + - name: Print installed packages + run: python -m pip list - name: Copying test data to services run: | python .ci_helpers/docker/setup-services.py --deploy --data-only --http-server ${{ job.services.httpserver.id }} diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index 1c5a4d6ca..630c0b002 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -145,13 +145,20 @@ the latter via `minio `_. will execute all tests. The entire test suite can be a bit slow, taking up to 40 minutes or more. If your changes impact only some of the subpackages (``convert``, ``calibrate``, ``preprocess``, etc), you can run ``run-test.py`` with only a subset of tests by passing -as an argument a comma-separated list of the modules that have changed. For example: +as an argument a comma-separated list of the modules that have changed or also run only particular test +files by passing a comma-separated list of test files that you want to run. For example: .. code-block:: bash python .ci_helpers/run-test.py --local --pytest-args="-vv" echopype/calibrate/calibrate_ek.py,echopype/preprocess/noise_est.py will run only tests associated with the ``calibrate`` and ``preprocess`` subpackages. + +.. 
code-block:: bash + + python .ci_helpers/run-test.py --local --pytest-args="-vv" echopype/tests/convert/test_convert_azfp.py,echopype/tests/clean/test_noise.py + +will run only the tests in the ``test_convert_azfp.py`` and ``test_noise.py`` files. For ``run-test.py`` usage information, use the ``-h`` argument: ``python .ci_helpers/run-test.py -h`` diff --git a/echopype/calibrate/api.py b/echopype/calibrate/api.py index 01fa7630a..c2c34c137 100644 --- a/echopype/calibrate/api.py +++ b/echopype/calibrate/api.py @@ -153,7 +153,7 @@ def compute_Sv(echodata: EchoData, **kwargs) -> xr.Dataset: - for EK60 echosounder, allowed parameters include: `"sa_correction"`, `"gain_correction"`, `"equivalent_beam_angle"` - for AZFP echosounder, allowed parameters include: - `"EL"`, `"DS"`, `"TVR"`, `"VTX"`, `"equivalent_beam_angle"`, `"Sv_offset"` + `"EL"`, `"DS"`, `"TVR"`, `"VTX0"`, `"equivalent_beam_angle"`, `"Sv_offset"` Passing in calibration parameters for other echosounders are not currently supported. @@ -242,7 +242,7 @@ def compute_TS(echodata: EchoData, **kwargs): - for EK60 echosounder, allowed parameters include: `"sa_correction"`, `"gain_correction"`, `"equivalent_beam_angle"` - for AZFP echosounder, allowed parameters include: - `"EL"`, `"DS"`, `"TVR"`, `"VTX"`, `"equivalent_beam_angle"`, `"Sv_offset"` + `"EL"`, `"DS"`, `"TVR"`, `"VTX0"`, `"equivalent_beam_angle"`, `"Sv_offset"` Passing in calibration parameters for other echosounders are not currently supported. diff --git a/echopype/calibrate/cal_params.py b/echopype/calibrate/cal_params.py index 0c861c9e9..5d020cc9b 100644 --- a/echopype/calibrate/cal_params.py +++ b/echopype/calibrate/cal_params.py @@ -29,7 +29,7 @@ "impedance_transceiver", # z_er "receiver_sampling_frequency", ), - "AZFP": ("EL", "DS", "TVR", "VTX", "equivalent_beam_angle", "Sv_offset"), + "AZFP": ("EL", "DS", "TVR", "VTX0", "equivalent_beam_angle", "Sv_offset"), } EK80_DEFAULT_PARAMS = { @@ -352,7 +352,7 @@ def get_cal_params_AZFP(beam: xr.DataArray, vend: xr.DataArray, user_dict: dict) out_dict[p] = beam[p] # has only channel dim # Params from Vendor_specific group - elif p in ["EL", "DS", "TVR", "VTX", "Sv_offset"]: + elif p in ["EL", "DS", "TVR", "VTX0", "Sv_offset"]: out_dict[p] = vend[p] # these params only have the channel dimension return out_dict diff --git a/echopype/calibrate/calibrate_azfp.py b/echopype/calibrate/calibrate_azfp.py index bc2f848f0..1299be8bb 100644 --- a/echopype/calibrate/calibrate_azfp.py +++ b/echopype/calibrate/calibrate_azfp.py @@ -63,7 +63,7 @@ def _cal_power_samples(self, cal_type, **kwargs): # TODO: take care of dividing by zero encountered in log10 spreading_loss = 20 * np.log10(self.range_meter) absorption_loss = 2 * self.env_params["sound_absorption"] * self.range_meter - SL = self.cal_params["TVR"] + 20 * np.log10(self.cal_params["VTX"]) # eq.(2) + SL = self.cal_params["TVR"] + 20 * np.log10(self.cal_params["VTX0"]) # eq.(2) # scaling factor (slope) in Fig.G-1, units Volts/dB], see p.84 a = self.cal_params["DS"] diff --git a/echopype/calibrate/range.py b/echopype/calibrate/range.py index aa93a6061..b3c049c5f 100644 --- a/echopype/calibrate/range.py +++ b/echopype/calibrate/range.py @@ -60,7 +60,7 @@ def compute_range_AZFP(echodata: EchoData, env_params: Dict, cal_type: str) -> x # Notation below follows p.86 of user manual N = vend["number_of_samples_per_average_bin"] # samples per bin f = vend["digitization_rate"] # digitization rate - L = vend["lockout_index"] # number of lockout samples + L = vend["lock_out_index"] # number of 
lockout samples # keep this in ref of AZFP matlab code, # set to 1 since we want to calculate from raw data diff --git a/echopype/commongrid/__init__.py b/echopype/commongrid/__init__.py index 642434353..63172d2bd 100644 --- a/echopype/commongrid/__init__.py +++ b/echopype/commongrid/__init__.py @@ -1,6 +1,7 @@ -from .api import compute_MVBS, compute_MVBS_index_binning +from .api import compute_MVBS, compute_MVBS_index_binning, compute_NASC __all__ = [ "compute_MVBS", + "compute_NASC", "compute_MVBS_index_binning", ] diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index ee71c12ec..396a943be 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -1,72 +1,44 @@ """ Functions for enhancing the spatial and temporal coherence of data. """ +import logging +from typing import Literal + import numpy as np import pandas as pd import xarray as xr +from ..consolidate.api import POSITION_VARIABLES from ..utils.prov import add_processing_level, echopype_prov_attrs, insert_input_processing_level -from .mvbs import get_MVBS_along_channels - - -def _set_var_attrs(da, long_name, units, round_digits, standard_name=None): - """ - Attach common attributes to DataArray variable. +from .utils import ( + _convert_bins_to_interval_index, + _get_reduced_positions, + _parse_x_bin, + _set_MVBS_attrs, + _set_var_attrs, + _setup_and_validate, + compute_raw_MVBS, + compute_raw_NASC, + get_distance_from_latlon, +) - Parameters - ---------- - da : xr.DataArray - DataArray that will receive attributes - long_name : str - Variable long_name attribute - units : str - Variable units attribute - round_digits : int - Number of digits after decimal point for rounding off actual_range - standard_name : str - CF standard_name, if available (optional) - """ - - da.attrs = { - "long_name": long_name, - "units": units, - "actual_range": [ - round(float(da.min().values), round_digits), - round(float(da.max().values), round_digits), - ], - } - if standard_name: - da.attrs["standard_name"] = standard_name - - -def _set_MVBS_attrs(ds): - """ - Attach common attributes. - - Parameters - ---------- - ds : xr.Dataset - dataset containing MVBS - """ - ds["ping_time"].attrs = { - "long_name": "Ping time", - "standard_name": "time", - "axis": "T", - } - - _set_var_attrs( - ds["Sv"], - long_name="Mean volume backscattering strength (MVBS, mean Sv re 1 m-1)", - units="dB", - round_digits=2, - ) +logger = logging.getLogger(__name__) @add_processing_level("L3*") -def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): +def compute_MVBS( + ds_Sv: xr.Dataset, + range_var: Literal["echo_range", "depth"] = "echo_range", + range_bin: str = "20m", + ping_time_bin: str = "20S", + method="map-reduce", + closed: Literal["left", "right"] = "left", + **flox_kwargs, +): """ Compute Mean Volume Backscattering Strength (MVBS) - based on intervals of range (``echo_range``) and ``ping_time`` specified in physical units. + based on intervals of range (``echo_range``) or depth (``depth``) + and ``ping_time`` specified in physical units. 
Output of this function differs from that of ``compute_MVBS_index_binning``, which computes bin-averaged Sv according to intervals of ``echo_range`` and ``ping_time`` specified as @@ -76,41 +48,83 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): ---------- ds_Sv : xr.Dataset dataset containing Sv and ``echo_range`` [m] - range_meter_bin : Union[int, float] - bin size along ``echo_range`` in meters, default to ``20`` - ping_time_bin : str - bin size along ``ping_time``, default to ``20S`` + range_var: {'echo_range', 'depth'}, default 'echo_range' + The variable to use for range binning. + Must be one of ``echo_range`` or ``depth``. + Note that ``depth`` is only available if the input dataset contains + ``depth`` as a data variable. + range_bin : str, default '20m' + bin size along ``echo_range`` or ``depth`` in meters. + ping_time_bin : str, default '20S' + bin size along ``ping_time`` + method: str, default 'map-reduce' + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. + closed: {'left', 'right'}, default 'left' + Which side of bin interval is closed. + **flox_kwargs + Additional keyword arguments to be passed + to flox reduction function. Returns ------- A dataset containing bin-averaged Sv """ + # Setup and validate + # * Sv dataset must contain specified range_var + # * Parse range_bin + # * Check closed value + ds_Sv, range_bin = _setup_and_validate(ds_Sv, range_var, range_bin, closed) + + if not isinstance(ping_time_bin, str): + raise TypeError("ping_time_bin must be a string") + # create bin information for echo_range - range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) + # this computes the echo range max since there might NaNs in the data + echo_range_max = ds_Sv[range_var].max() + range_interval = np.arange(0, echo_range_max + range_bin, range_bin) # create bin information needed for ping_time - ping_interval = ( - ds_Sv.ping_time.resample(ping_time=ping_time_bin, skipna=True).asfreq().ping_time.values + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .first() # Not actually being used, but needed to get the bin groups + .indexes["ping_time"] + ) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]).values + + # Set interval index for groups + ping_interval = _convert_bins_to_interval_index(ping_interval, closed=closed) + range_interval = _convert_bins_to_interval_index(range_interval, closed=closed) + raw_MVBS = compute_raw_MVBS( + ds_Sv, + range_interval, + ping_interval, + range_var=range_var, + method=method, + **flox_kwargs, ) - - # calculate the MVBS along each channel - MVBS_values = get_MVBS_along_channels(ds_Sv, range_interval, ping_interval) # create MVBS dataset + # by transforming the binned dimensions to regular coords ds_MVBS = xr.Dataset( - data_vars={"Sv": (["channel", "ping_time", "echo_range"], MVBS_values)}, + data_vars={"Sv": (["channel", "ping_time", range_var], raw_MVBS["Sv"].data)}, coords={ - "ping_time": ping_interval, - "channel": ds_Sv.channel, - "echo_range": range_interval[:-1], + "ping_time": np.array([v.left for v in raw_MVBS.ping_time_bins.values]), + "channel": raw_MVBS.channel.values, + range_var: np.array([v.left for v in raw_MVBS[f"{range_var}_bins"].values]), }, ) - # TODO: look into why 'filenames' exist here as a variable - # Added this check to support the test in test_process.py::test_compute_MVBS - if "filenames" in ds_MVBS.variables: - ds_MVBS = 
ds_MVBS.drop_vars("filenames") + # If dataset has position information + # propagate this to the final MVBS dataset + ds_MVBS = _get_reduced_positions(ds_Sv, ds_MVBS, "MVBS", ping_interval) + + # Add water level if uses echo_range and it exists in Sv dataset + if range_var == "echo_range" and "water_level" in ds_Sv.data_vars: + ds_MVBS["water_level"] = ds_Sv["water_level"] # ping_time_bin parsing and conversions # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings @@ -143,17 +157,17 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): # Attach attributes _set_MVBS_attrs(ds_MVBS) - ds_MVBS["echo_range"].attrs = {"long_name": "Range distance", "units": "m"} + ds_MVBS[range_var].attrs = {"long_name": "Range distance", "units": "m"} ds_MVBS["Sv"] = ds_MVBS["Sv"].assign_attrs( { "cell_methods": ( f"ping_time: mean (interval: {ping_time_bin_resvalue} {ping_time_bin_resunit_label} " # noqa "comment: ping_time is the interval start) " - f"echo_range: mean (interval: {range_meter_bin} meter " - "comment: echo_range is the interval start)" + f"{range_var}: mean (interval: {range_bin} meter " + f"comment: {range_var} is the interval start)" ), "binning_mode": "physical units", - "range_meter_interval": str(range_meter_bin) + "m", + "range_meter_interval": str(range_bin) + "m", "ping_time_interval": ping_time_bin, "actual_range": [ round(float(ds_MVBS["Sv"].min().values), 2), @@ -247,131 +261,148 @@ def compute_MVBS_index_binning(ds_Sv, range_sample_num=100, ping_num=100): return ds_MVBS -# def compute_NASC( -# ds_Sv: xr.Dataset, -# cell_dist: Union[int, float], # TODO: allow xr.DataArray -# cell_depth: Union[int, float], # TODO: allow xr.DataArray -# ) -> xr.Dataset: -# """ -# Compute Nautical Areal Scattering Coefficient (NASC) from an Sv dataset. - -# Parameters -# ---------- -# ds_Sv : xr.Dataset -# A dataset containing Sv data. -# The Sv dataset must contain ``latitude``, ``longitude``, and ``depth`` as data variables. -# cell_dist: int, float -# The horizontal size of each NASC cell, in nautical miles [nmi] -# cell_depth: int, float -# The vertical size of each NASC cell, in meters [m] - -# Returns -# ------- -# xr.Dataset -# A dataset containing NASC - -# Notes -# ----- -# The NASC computation implemented here corresponds to the Echoview algorithm PRC_NASC -# https://support.echoview.com/WebHelp/Reference/Algorithms/Analysis_Variables/PRC_ABC_and_PRC_NASC.htm#PRC_NASC # noqa -# The difference is that since in echopype masking of the Sv dataset is done explicitly using -# functions in the ``mask`` subpackage so the computation only involves computing the -# mean Sv and the mean height within each cell. - -# In addition, here the binning of pings into individual cells is based on the actual horizontal -# distance computed from the latitude and longitude coordinates of each ping in the Sv dataset. -# Therefore, both regular and irregular horizontal distance in the Sv dataset are allowed. -# This is different from Echoview's assumption of constant ping rate, vessel speed, and sample -# thickness when computing mean Sv. 
-# """ -# # Check Sv contains lat/lon -# if "latitude" not in ds_Sv or "longitude" not in ds_Sv: -# raise ValueError("Both 'latitude' and 'longitude' must exist in the input Sv dataset.") - -# # Check if depth vectors are identical within each channel -# if not ds_Sv["depth"].groupby("channel").map(check_identical_depth).all(): -# raise ValueError( -# "Only Sv data with identical depth vectors across all pings " -# "are allowed in the current compute_NASC implementation." -# ) - -# # Get distance from lat/lon in nautical miles -# dist_nmi = get_distance_from_latlon(ds_Sv) - -# # Find binning indices along distance -# bin_num_dist, dist_bin_idx = get_dist_bin_info(dist_nmi, cell_dist) # dist_bin_idx is 1-based - -# # Find binning indices along depth: channel-dependent -# bin_num_depth, depth_bin_idx = get_depth_bin_info(ds_Sv, cell_depth) # depth_bin_idx is 1-based # noqa - -# # Compute mean sv (volume backscattering coefficient, linear scale) -# # This is essentially to compute MVBS over a the cell defined here, -# # which are typically larger than those used for MVBS. -# # The implementation below is brute force looping, but can be optimized -# # by experimenting with different delayed schemes. -# # The optimized routines can then be used here and -# # in commongrid.compute_MVBS and clean.estimate_noise -# sv_mean = [] -# for ch_seq in np.arange(ds_Sv["channel"].size): -# # TODO: .compute each channel sequentially? -# # dask.delay within each channel? -# ds_Sv_ch = ds_Sv["Sv"].isel(channel=ch_seq).data # preserve the underlying type - -# sv_mean_dist_depth = [] -# for dist_idx in np.arange(bin_num_dist) + 1: # along ping_time -# sv_mean_depth = [] -# for depth_idx in np.arange(bin_num_depth) + 1: # along depth -# # Sv dim: ping_time x depth -# Sv_cut = ds_Sv_ch[dist_idx == dist_bin_idx, :][ -# :, depth_idx == depth_bin_idx[ch_seq] -# ] -# sv_mean_depth.append(np.nanmean(10 ** (Sv_cut / 10))) -# sv_mean_dist_depth.append(sv_mean_depth) - -# sv_mean.append(sv_mean_dist_depth) - -# # Compute mean height -# # For data with uniform depth step size, mean height = vertical size of cell -# height_mean = cell_depth -# # TODO: generalize to variable depth step size - -# ds_NASC = xr.DataArray( -# np.array(sv_mean) * height_mean, -# dims=["channel", "distance", "depth"], -# coords={ -# "channel": ds_Sv["channel"].values, -# "distance": np.arange(bin_num_dist) * cell_dist, -# "depth": np.arange(bin_num_depth) * cell_depth, -# }, -# name="NASC", -# ).to_dataset() - -# ds_NASC["frequency_nominal"] = ds_Sv["frequency_nominal"] # re-attach frequency_nominal - -# # Attach attributes -# _set_var_attrs( -# ds_NASC["NASC"], -# long_name="Nautical Areal Scattering Coefficient (NASC, m2 nmi-2)", -# units="m2 nmi-2", -# round_digits=3, -# ) -# _set_var_attrs(ds_NASC["distance"], "Cumulative distance", "m", 3) -# _set_var_attrs(ds_NASC["depth"], "Cell depth", "m", 3, standard_name="depth") - -# # Calculate and add ACDD bounding box global attributes -# ds_NASC.attrs["Conventions"] = "CF-1.7,ACDD-1.3" -# ds_NASC.attrs["time_coverage_start"] = np.datetime_as_string( -# ds_Sv["ping_time"].min().values, timezone="UTC" -# ) -# ds_NASC.attrs["time_coverage_end"] = np.datetime_as_string( -# ds_Sv["ping_time"].max().values, timezone="UTC" -# ) -# ds_NASC.attrs["geospatial_lat_min"] = round(float(ds_Sv["latitude"].min().values), 5) -# ds_NASC.attrs["geospatial_lat_max"] = round(float(ds_Sv["latitude"].max().values), 5) -# ds_NASC.attrs["geospatial_lon_min"] = round(float(ds_Sv["longitude"].min().values), 5) -# 
ds_NASC.attrs["geospatial_lon_max"] = round(float(ds_Sv["longitude"].max().values), 5) - -# return ds_NASC +@add_processing_level("L4") +def compute_NASC( + ds_Sv: xr.Dataset, + range_bin: str = "10m", + dist_bin: str = "0.5nmi", + method: str = "map-reduce", + closed: Literal["left", "right"] = "left", + **flox_kwargs, +) -> xr.Dataset: + """ + Compute Nautical Areal Scattering Coefficient (NASC) from an Sv dataset. + + Parameters + ---------- + ds_Sv : xr.Dataset + A dataset containing Sv data. + The Sv dataset must contain ``latitude``, ``longitude``, and ``depth`` as data variables. + range_bin : str, default '10m' + bin size along ``depth`` in meters (m). + dist_bin : str, default '0.5nmi' + bin size along ``distance`` in nautical miles (nmi). + method: str, default 'map-reduce' + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. + closed: {'left', 'right'}, default 'left' + Which side of bin interval is closed. + **flox_kwargs + Additional keyword arguments to be passed + to flox reduction function. + + Returns + ------- + xr.Dataset + A dataset containing NASC + + Notes + ----- + The NASC computation implemented here generally corresponds to the Echoview algorithm PRC_NASC + https://support.echoview.com/WebHelp/Reference/Algorithms/Analysis_Variables/PRC_ABC_and_PRC_NASC.htm#PRC_NASC # noqa + The difference is that since in echopype masking of the Sv dataset is done explicitly using + functions in the ``mask`` subpackage, the computation only involves computing the + mean Sv and the mean height within each cell, where some Sv "pixels" may have been + masked as NaN. + + In addition, in echopype the binning of pings into individual cells is based on the actual horizontal + distance computed from the latitude and longitude coordinates of each ping in the Sv dataset. + Therefore, both regular and irregular horizontal distance in the Sv dataset are allowed. + This is different from Echoview's assumption of constant ping rate, vessel speed, and sample + thickness when computing mean Sv + (see https://support.echoview.com/WebHelp/Reference/Algorithms/Analysis_Variables/Sv_mean.htm#Conversions). 
# noqa + """ + # Set range_var to be 'depth' + range_var = "depth" + + # Setup and validate + # * Sv dataset must contain latitude, longitude, and depth + # * Parse range_bin + # * Check closed value + ds_Sv, range_bin = _setup_and_validate( + ds_Sv, range_var, range_bin, closed, required_data_vars=POSITION_VARIABLES + ) + + # Check if dist_bin is a string + if not isinstance(dist_bin, str): + raise TypeError("dist_bin must be a string") + + # Parse the dist_bin string and convert to float + dist_bin = _parse_x_bin(dist_bin, "dist_bin") + + # Get distance from lat/lon in nautical miles + dist_nmi = get_distance_from_latlon(ds_Sv) + ds_Sv = ds_Sv.assign_coords({"distance_nmi": ("ping_time", dist_nmi)}).swap_dims( + {"ping_time": "distance_nmi"} + ) + + # create bin information along range_var + # this computes the range_var max since there might NaNs in the data + range_var_max = ds_Sv[range_var].max() + range_interval = np.arange(0, range_var_max + range_bin, range_bin) + + # create bin information along distance_nmi + # this computes the distance max since there might NaNs in the data + dist_max = ds_Sv["distance_nmi"].max() + dist_interval = np.arange(0, dist_max + dist_bin, dist_bin) + + # Set interval index for groups + dist_interval = _convert_bins_to_interval_index(dist_interval, closed=closed) + range_interval = _convert_bins_to_interval_index(range_interval, closed=closed) + + raw_NASC = compute_raw_NASC( + ds_Sv, + range_interval, + dist_interval, + method=method, + **flox_kwargs, + ) + + # create MVBS dataset + # by transforming the binned dimensions to regular coords + ds_NASC = xr.Dataset( + data_vars={"NASC": (["channel", "distance", range_var], raw_NASC["sv"].data)}, + coords={ + "distance": np.array([v.left for v in raw_NASC["distance_nmi_bins"].values]), + "channel": raw_NASC["channel"].values, + range_var: np.array([v.left for v in raw_NASC[f"{range_var}_bins"].values]), + }, + ) + + # If dataset has position information + # propagate this to the final NASC dataset + ds_NASC = _get_reduced_positions(ds_Sv, ds_NASC, "NASC", dist_interval) + + # Set ping time binning information + ds_NASC["ping_time"] = (["distance"], raw_NASC["ping_time"].data, ds_Sv["ping_time"].attrs) + + ds_NASC["frequency_nominal"] = ds_Sv["frequency_nominal"] # re-attach frequency_nominal + + # Attach attributes + _set_var_attrs( + ds_NASC["NASC"], + long_name="Nautical Areal Scattering Coefficient (NASC, m2 nmi-2)", + units="m2 nmi-2", + round_digits=3, + ) + _set_var_attrs(ds_NASC["distance"], "Cumulative distance", "nmi", 3) + _set_var_attrs(ds_NASC["depth"], "Cell depth", "m", 3, standard_name="depth") + + # Calculate and add ACDD bounding box global attributes + ds_NASC.attrs["Conventions"] = "CF-1.7,ACDD-1.3" + ds_NASC.attrs["time_coverage_start"] = np.datetime_as_string( + ds_Sv["ping_time"].min().values, timezone="UTC" + ) + ds_NASC.attrs["time_coverage_end"] = np.datetime_as_string( + ds_Sv["ping_time"].max().values, timezone="UTC" + ) + ds_NASC.attrs["geospatial_lat_min"] = round(float(ds_Sv["latitude"].min().values), 5) + ds_NASC.attrs["geospatial_lat_max"] = round(float(ds_Sv["latitude"].max().values), 5) + ds_NASC.attrs["geospatial_lon_min"] = round(float(ds_Sv["longitude"].min().values), 5) + ds_NASC.attrs["geospatial_lon_max"] = round(float(ds_Sv["longitude"].max().values), 5) + + return ds_NASC def regrid(): diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py deleted file mode 100644 index 2c0006f35..000000000 --- a/echopype/commongrid/mvbs.py +++ /dev/null @@ -1,461 
+0,0 @@ -""" -Contains core functions needed to compute the MVBS of an input dataset. -""" - -import warnings -from typing import Tuple, Union - -import dask.array -import numpy as np -import xarray as xr - - -def get_bin_indices( - echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray -) -> Tuple[np.ndarray, np.ndarray]: - """ - Obtains the bin index of ``echo_range`` and ``times`` based - on the binning ``bins_er`` and ``bins_time``, respectively. - - Parameters - ---------- - echo_range: np.ndarray - 2D array of echo range values - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - - Returns - ------- - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - bin_time_ind: np.ndarray - 1D array of bin indices for ``times`` - """ - - # get bin index for each echo range value - digitized_echo_range = np.digitize(echo_range, bins_er, right=False) - - # turn datetime into integers, so we can use np.digitize - if isinstance(times, dask.array.Array): - times_i8 = times.compute().data.view("i8") - else: - times_i8 = times.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_time_i8 = bins_time.view("i8") - - # get bin index for each time - bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) - - return digitized_echo_range, bin_time_ind - - -def bin_and_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int -) -> Union[np.ndarray, dask.array.Array]: - """ - Bins and means ``arr`` with respect to the ``echo_range`` bins. - - Parameters - ---------- - arr: np.ndarray or dask.array.Array - 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` - and compute mean of each bin - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: np.ndarray or dask.array.Array - 2D array representing the bin and mean of ``arr`` along ``echo_range`` - """ - - binned_means = [] - for bin_er in range(1, n_bin_er): - # Catch a known warning that can occur, which does not impact the results - with warnings.catch_warnings(): - # ignore warnings caused by taking a mean of an array filled with NaNs - warnings.filterwarnings(action="ignore", message="Mean of empty slice") - - # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - - # collect all echo_range bins - binned_means.append(er_selected_data) - - # create full echo_range binned array - er_means = np.vstack(binned_means) - - return er_means - - -def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: - """ - Obtains those row indices of ``mat`` that are not equal - to ``row``. - - Parameters - ---------- - mat: np.ndarray - 2D array with the same column dimension as the number - of elements in ``row`` - row: np.ndarray - 1D array with the same number of element elements as - the column dimension of ``mat`` - - Returns - ------- - row_ind_not_equal: np.ndarray - The row indices of ``mat`` that are not equal to ``row`` - - Notes - ----- - Elements with NaNs are considered equal if they are in the same position. 
- """ - - # compare row against all rows in mat (allowing for NaNs to be equal) - element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) - - # determine if mat row is equal to row - row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) - - if isinstance(row_not_equal, dask.array.Array): - row_not_equal = row_not_equal.compute() - - # get those row indices that are not equal to row - row_ind_not_equal = np.argwhere(row_not_equal).flatten() - - return row_ind_not_equal - - -def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - A comprehensive check that determines if all ``echo_range`` values - along ``ping_time`` have the same step size. If they do not have - the same step sizes, then grouping of the ``echo_range`` values - will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # grab the in-memory numpy echo_range values, if necessary - if isinstance(er_chan, xr.DataArray): - er_chan = er_chan.values - - # grab the first ping_time that is not filled with NaNs - ping_index = 0 - while np.all(np.isnan(er_chan[ping_index, :])): - ping_index += 1 - - # determine those rows of er_chan that are not equal to the row ping_index - unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) - - if len(unequal_ping_ind) > 0: - # see if all unequal_ping_ind are filled with NaNs - all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) - - if all_nans: - # All echo_range values have the same step size - return False - else: - # Some echo_range values have different step sizes - return True - else: - # All echo_range values have the same step size - return False - - -def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - An alternative (less comprehensive) check that determines if all - ``echo_range`` values along ``ping_time`` have the same step size. - If they do not have the same step sizes, then grouping of the - ``echo_range`` values will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - It is possible that this method will incorrectly determine if grouping - is necessary. 
- - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # determine the number of NaNs in each ping and find the unique number of NaNs - unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) - - # compute the results, if necessary, to allow for downstream checks - if isinstance(unique_num_nans, dask.array.Array): - unique_num_nans = unique_num_nans.compute() - - # determine if any value is not 0 or er_chan.shape[1] - unexpected_num_nans = False in np.logical_or( - unique_num_nans == 0, unique_num_nans == er_chan.shape[1] - ) - - if unexpected_num_nans: - # echo_range varies with ping_time - return True - else: - # make sure that the final echo_range value for each ping_time is the same (account for NaN) - num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() - - # compute the results, if necessary, to allow for downstream checks - if isinstance(num_non_nans, dask.array.Array): - num_non_nans = num_non_nans.compute() - - if num_non_nans > 1: - # echo_range varies with ping_time - return True - else: - # echo_range does not vary with ping_time - return False - - -def is_er_grouping_needed( - echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool -) -> bool: - """ - Determines if ``echo_range`` values along ``ping_time`` can change and - thus need to be grouped. - - Parameters - ---------- - echo_range: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - bool - If True grouping of ``echo_range`` will be required, else it will not - be necessary - """ - - if comprehensive_er_check: - return if_all_er_steps_identical(echo_range) - else: - return if_last_er_steps_identical(echo_range) - - -def group_dig_er_bin_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], - digitized_echo_range: Union[np.ndarray, dask.array.Array], - n_bin_er: int, -) -> Union[np.ndarray, dask.array.Array]: - """ - Groups the rows of ``arr`` such that they have the same corresponding - row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` - on each group, and lastly assembles the correctly ordered ``er_means`` array - representing the bin and mean of ``arr`` with respect to ``echo_range``. 
- - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - digitized_echo_range: dask.array.Array or np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: dask.array.Array or np.ndarray - The bin and mean of ``arr`` with respect to ``echo_range`` - """ - - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - # determine the unique rows of digitized_echo_range and the inverse - unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) - - # create groups of row indices using the unique inverse - grps_same_ind = [ - np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) - ] - - # for each group bin and mean arr along echo_range - # note: the values appended may not be in the correct final order - binned_er = [] - for count, grp in enumerate(grps_same_ind): - binned_er.append( - bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) - ) - - # construct er_means and put the columns in the correct order - binned_er_array = np.hstack(binned_er) - correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) - er_means = binned_er_array[:, correct_column_ind] - - return er_means - - -def bin_and_mean_2d( - arr: Union[dask.array.Array, np.ndarray], - bins_time: np.ndarray, - bins_er: np.ndarray, - times: np.ndarray, - echo_range: np.ndarray, - comprehensive_er_check: bool = True, -) -> np.ndarray: - """ - Bins and means ``arr`` based on ``times`` and ``echo_range``, - and their corresponding bins. If ``arr`` is ``Sv`` then this - will compute the MVBS. - - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - echo_range: np.ndarray - 2D array of echo range values - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - final_reduced: np.ndarray - The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS - - Notes - ----- - This function assumes that ``arr`` has rows corresponding to - ``ping_time`` and columns corresponding to ``echo_range``. - - This function should not be run if the number of ``echo_range`` values - vary amongst ``ping_times``. This should not occur for our current use - of echopype-generated Sv data. 
- """ - - # get the number of echo range and time bins - n_bin_er = len(bins_er) - n_bin_time = len(bins_time) - - # obtain the bin indices for echo_range and times - digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - - # determine if grouping of echo_range values with the same step size is necessary - er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) - - if er_grouping_needed: - # groups, bins, and means arr with respect to echo_range - er_means = group_dig_er_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) - else: - # bin and mean arr with respect to echo_range - er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) - - # if er_means is a dask array we compute it so the graph does not get too large - if isinstance(er_means, dask.array.Array): - er_means = er_means.compute() - - # create final reduced array i.e. MVBS - final = np.empty((n_bin_time, n_bin_er - 1)) - for bin_time in range(1, n_bin_time + 1): - # obtain er_mean indices corresponding to the time bin - indices = np.argwhere(bin_time_ind == bin_time).flatten() - - if len(indices) == 0: - # fill values with NaN, if there are no values in the bin - final[bin_time - 1, :] = np.nan - else: - # bin and mean the er_mean time bin - final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) - - return final - - -def get_MVBS_along_channels( - ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray -) -> np.ndarray: - """ - Computes the MVBS of ``ds_Sv`` along each channel for the given - intervals. - - Parameters - ---------- - ds_Sv: xr.Dataset - A Dataset containing ``Sv`` and ``echo_range`` data with coordinates - ``channel``, ``ping_time``, and ``range_sample`` - echo_range_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - ping_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``ping_time`` - - Returns - ------- - np.ndarray - The MVBS value of the input ``ds_Sv`` for all channels - - Notes - ----- - If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with - respect to ``echo_range`` will take place, then the delayed result will be computed, - and lastly the binning and mean with respect to ``ping_time`` will be completed. It - is necessary to apply a compute midway through this method because Dask graph layers - get too large and this makes downstream operations very inefficient. - """ - - all_MVBS = [] - for chan in ds_Sv.channel: - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. 
- ds = ds_Sv.sel(channel=chan).squeeze() - - # average should be done in linear domain - sv = 10 ** (ds["Sv"] / 10) - - # get MVBS for channel in linear domain - chan_MVBS = bin_and_mean_2d( - sv.data, - bins_time=ping_interval, - bins_er=echo_range_interval, - times=sv.ping_time.data, - echo_range=ds["echo_range"], - comprehensive_er_check=True, - ) - - # apply inverse mapping to get back to the original domain and store values - all_MVBS.append(10 * np.log10(chan_MVBS)) - - # collect the MVBS values for each channel - return np.stack(all_MVBS, axis=0) diff --git a/echopype/commongrid/nasc.py b/echopype/commongrid/nasc.py deleted file mode 100644 index 2656f2bc7..000000000 --- a/echopype/commongrid/nasc.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -An overhaul is required for the below compute_NASC implementation, to: -- increase the computational efficiency -- debug the current code of any discrepancy against Echoview implementation -- potentially provide an alternative based on ping-by-ping Sv - -This script contains functions used by commongrid.compute_NASC, -but a subset of these overlap with operations needed -for commongrid.compute_MVBS and clean.estimate_noise. -The compute_MVBS and remove_noise code needs to be refactored, -and the compute_NASC needs to be optimized. -The plan is to create a common util set of functions for use in -these functions in an upcoming release. -""" - -import numpy as np -import xarray as xr -from geopy import distance - - -def check_identical_depth(ds_ch): - """ - Check if all pings have the same depth vector. - """ - # Depth vector are identical for all pings, if: - # - the number of non-NaN range_sample is the same for all pings, AND - # - all pings have the same max range - num_nan = np.isnan(ds_ch.values).sum(axis=1) - nan_check = True if np.all(num_nan == 0) or np.unique(num_nan).size == 1 else False - - if not nan_check: - return xr.DataArray(False, coords={"channel": ds_ch["channel"]}) - else: - # max range of each ping should be identical - max_range_ping = ds_ch.values[np.arange(ds_ch.shape[0]), ds_ch.shape[1] - num_nan - 1] - if np.unique(max_range_ping).size == 1: - return xr.DataArray(True, coords={"channel": ds_ch["channel"]}) - else: - return xr.DataArray(False, coords={"channel": ds_ch["channel"]}) - - -def get_depth_bin_info(ds_Sv, cell_depth): - """ - Find binning indices along depth - """ - depth_ping1 = ds_Sv["depth"].isel(ping_time=0) - num_nan = np.isnan(depth_ping1.values).sum(axis=1) - # ping 1 max range of each channel - max_range_ch = depth_ping1.values[ - np.arange(depth_ping1.shape[0]), depth_ping1.shape[1] - num_nan - 1 - ] - bin_num_depth = np.ceil(max_range_ch.max() / cell_depth) # use max range of all channel - depth_bin_idx = [ - np.digitize(dp1, np.arange(bin_num_depth + 1) * cell_depth, right=False) - for dp1 in depth_ping1 - ] - return bin_num_depth, depth_bin_idx - - -def get_distance_from_latlon(ds_Sv): - # Get distance from lat/lon in nautical miles - df_pos = ds_Sv["latitude"].to_dataframe().join(ds_Sv["longitude"].to_dataframe()) - df_pos["latitude_prev"] = df_pos["latitude"].shift(-1) - df_pos["longitude_prev"] = df_pos["longitude"].shift(-1) - df_latlon_nonan = df_pos.dropna().copy() - df_latlon_nonan["dist"] = df_latlon_nonan.apply( - lambda x: distance.distance( - (x["latitude"], x["longitude"]), - (x["latitude_prev"], x["longitude_prev"]), - ).nm, - axis=1, - ) - df_pos = df_pos.join(df_latlon_nonan["dist"], how="left") - df_pos["dist"] = df_pos["dist"].cumsum() - df_pos["dist"] = 
df_pos["dist"].fillna(method="ffill").fillna(method="bfill") - - return df_pos["dist"].values - - -def get_dist_bin_info(dist_nmi, cell_dist): - bin_num_dist = np.ceil(dist_nmi.max() / cell_dist) - if np.mod(dist_nmi.max(), cell_dist) == 0: - # increment bin num if last element coincides with bin edge - bin_num_dist = bin_num_dist + 1 - dist_bin_idx = np.digitize(dist_nmi, np.arange(bin_num_dist + 1) * cell_dist, right=False) - return bin_num_dist, dist_bin_idx diff --git a/echopype/commongrid/utils.py b/echopype/commongrid/utils.py new file mode 100644 index 000000000..39d2cd62a --- /dev/null +++ b/echopype/commongrid/utils.py @@ -0,0 +1,557 @@ +import logging +import re +from typing import Literal, Optional, Tuple, Union + +import numpy as np +import pandas as pd +import xarray as xr +from flox.xarray import xarray_reduce +from geopy import distance + +from ..consolidate.api import POSITION_VARIABLES +from ..utils.compute import _lin2log, _log2lin + +logger = logging.getLogger(__name__) + + +def compute_raw_MVBS( + ds_Sv: xr.Dataset, + range_interval: Union[pd.IntervalIndex, np.ndarray], + ping_interval: Union[pd.IntervalIndex, np.ndarray], + range_var: Literal["echo_range", "depth"] = "echo_range", + method="map-reduce", + **flox_kwargs, +): + """ + Compute the raw unformatted MVBS of ``ds_Sv``. + + Parameters + ---------- + ds_Sv : xr.Dataset + A Dataset containing ``Sv`` and ``echo_range`` data + with coordinates ``channel``, ``ping_time``, and ``range_sample`` + at bare minimum. + Or this can contain ``Sv`` and ``depth`` data with similar coordinates. + range_interval: pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for ``range_var`` + ping_interval : pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for ``ping_time``. + range_var: {'echo_range', 'depth'}, default 'echo_range' + The variable to use for range binning. + Either ``echo_range`` or ``depth``. + method: str + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. + **flox_kwargs + Additional keyword arguments to be passed + to flox reduction function. + + Returns + ------- + xr.Dataset + The MVBS dataset of the input ``ds_Sv`` for all channels. + """ + # Set initial variables + ds = xr.Dataset() + x_var = "ping_time" + + sv_mean = _groupby_x_along_channels( + ds_Sv, + range_interval, + x_interval=ping_interval, + x_var=x_var, + range_var=range_var, + method=method, + **flox_kwargs, + ) + + # This is MVBS computation + # apply inverse mapping to get back to the original domain and store values + da_MVBS = sv_mean.pipe(_lin2log) + # return xr.merge([ds_Pos, da_MVBS]) + return xr.merge([ds, da_MVBS]) + + +def compute_raw_NASC( + ds_Sv: xr.Dataset, + range_interval: Union[pd.IntervalIndex, np.ndarray], + dist_interval: Union[pd.IntervalIndex, np.ndarray], + method="map-reduce", + **flox_kwargs, +): + """ + Compute the raw unformatted NASC of ``ds_Sv``. + + Parameters + ---------- + ds_Sv : xr.Dataset + A Dataset containing ``Sv`` and ``depth`` data + with coordinates ``channel``, ``distance_nmi``, and ``range_sample``. + range_interval: pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for ``range_var``. + dist_interval : pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for ``distance_nmi``. + method: str + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. 
+ **flox_kwargs + Additional keyword arguments to be passed + to flox reduction function. + + Returns + ------- + xr.Dataset + The MVBS or NASC dataset of the input ``ds_Sv`` for all channels + """ + # Set initial variables + ds = xr.Dataset() + x_var = "distance_nmi" + range_var = "depth" + + # Determine range_dim for NASC computation + range_dim = "range_sample" + if range_dim not in ds_Sv.dims: + range_dim = "depth" + + sv_mean = _groupby_x_along_channels( + ds_Sv, + range_interval, + x_interval=dist_interval, + x_var=x_var, + range_var=range_var, + method=method, + **flox_kwargs, + ) + + # Get mean ping_time along distance_nmi + # this is only done for NASC computation, + # since for MVBS the ping_time is used for binning already. + ds_ping_time = xarray_reduce( + ds_Sv["ping_time"], + ds_Sv[x_var], + func="nanmean", + expected_groups=(dist_interval), + isbin=True, + method=method, + ) + + # Mean height: approach to use flox + # Numerator (h_mean_num): + # - create a dataarray filled with the first difference of sample height + # with 2D coordinate (distance, depth) + # - flox xarray_reduce along both distance and depth, summing over each 2D bin + # Denominator (h_mean_denom): + # - create a datararray filled with 1, with 1D coordinate (distance) + # - flox xarray_reduce along distance, summing over each 1D bin + # h_mean = N/D + da_denom = xr.ones_like(ds_Sv[x_var]) + h_mean_denom = xarray_reduce( + da_denom, + ds_Sv[x_var], + func="sum", + expected_groups=(dist_interval), + isbin=[True], + method=method, + ) + + h_mean_num = xarray_reduce( + ds_Sv[range_var].diff(dim=range_dim, label="lower"), # use lower end label after diff + ds_Sv["channel"], + ds_Sv[x_var], + ds_Sv[range_var].isel(**{range_dim: slice(0, -1)}), + func="sum", + expected_groups=(None, dist_interval, range_interval), + isbin=[False, True, True], + method=method, + ) + h_mean = h_mean_num / h_mean_denom + + # Combine to compute NASC and name it + raw_NASC = sv_mean * h_mean * 4 * np.pi * 1852**2 + raw_NASC.name = "sv" + + return xr.merge([ds, ds_ping_time, raw_NASC]) + + +def get_distance_from_latlon(ds_Sv): + # Get distance from lat/lon in nautical miles + df_pos = ds_Sv["latitude"].to_dataframe().join(ds_Sv["longitude"].to_dataframe()) + df_pos["latitude_prev"] = df_pos["latitude"].shift(-1) + df_pos["longitude_prev"] = df_pos["longitude"].shift(-1) + df_latlon_nonan = df_pos.dropna().copy() + df_latlon_nonan["dist"] = df_latlon_nonan.apply( + lambda x: distance.distance( + (x["latitude"], x["longitude"]), + (x["latitude_prev"], x["longitude_prev"]), + ).nm, + axis=1, + ) + df_pos = df_pos.join(df_latlon_nonan["dist"], how="left") + df_pos["dist"] = df_pos["dist"].cumsum() + df_pos["dist"] = df_pos["dist"].ffill().bfill() + + return df_pos["dist"].values + + +def _set_var_attrs(da, long_name, units, round_digits, standard_name=None): + """ + Attach common attributes to DataArray variable. 
+ + Parameters + ---------- + da : xr.DataArray + DataArray that will receive attributes + long_name : str + Variable long_name attribute + units : str + Variable units attribute + round_digits : int + Number of digits after decimal point for rounding off actual_range + standard_name : str + CF standard_name, if available (optional) + """ + + da.attrs = { + "long_name": long_name, + "units": units, + "actual_range": [ + round(float(da.min().values), round_digits), + round(float(da.max().values), round_digits), + ], + } + if standard_name: + da.attrs["standard_name"] = standard_name + + +def _set_MVBS_attrs(ds): + """ + Attach common attributes. + + Parameters + ---------- + ds : xr.Dataset + dataset containing MVBS + """ + ds["ping_time"].attrs = { + "long_name": "Ping time", + "standard_name": "time", + "axis": "T", + } + + _set_var_attrs( + ds["Sv"], + long_name="Mean volume backscattering strength (MVBS, mean Sv re 1 m-1)", + units="dB", + round_digits=2, + ) + + +def _convert_bins_to_interval_index( + bins: list, closed: Literal["left", "right"] = "left" +) -> pd.IntervalIndex: + """ + Convert bins to sorted pandas IntervalIndex + with specified closed end + + Parameters + ---------- + bins : list + The bin edges + closed : {'left', 'right'}, default 'left' + Which side of bin interval is closed + + Returns + ------- + pd.IntervalIndex + The resulting IntervalIndex + """ + return pd.IntervalIndex.from_breaks(bins, closed=closed).sort_values() + + +def _parse_x_bin(x_bin: str, x_label="range_bin") -> float: + """ + Parses x bin string, check unit, + and returns x bin in the specified unit. + + Currently only available for: + range_bin: meters (m) + dist_bin: nautical miles (nmi) + + Parameters + ---------- + x_bin : str + X bin string, e.g., "0.5nmi" or "10m" + x_label : {"range_bin", "dist_bin"}, default "range_bin" + The label of the x bin. + + Returns + ------- + float + The resulting x bin value in x unit, + based on label. + + Raises + ------ + ValueError + If the x bin string doesn't include unit value. + TypeError + If the x bin is not a type string. + KeyError + If the x label is not one of the available labels. + """ + x_bin_map = { + "range_bin": { + "name": "Range bin", + "unit": "m", + "ex": "10m", + "unit_label": "meters", + "pattern": r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(m)", + }, + "dist_bin": { + "name": "Distance bin", + "unit": "nmi", + "ex": "0.5nmi", + "unit_label": "nautical miles", + "pattern": r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(nmi)", + }, + } + x_bin_info = x_bin_map.get(x_label, None) + + if x_bin_info is None: + raise KeyError(f"x_label must be one of {list(x_bin_map.keys())}") + + # First check for bin types + if not isinstance(x_bin, str): + raise TypeError("'x_bin' must be a string") + # normalize to lower case + # for x_bin + x_bin = x_bin.strip().lower() + # Only matches meters + match_obj = re.match(x_bin_info["pattern"], x_bin) + + # Do some checks on x_bin inputs + if match_obj is None: + # This shouldn't be other units + raise ValueError( + f"{x_bin_info['name']} must be in " + f"{x_bin_info['unit_label']} " + f"(e.g., '{x_bin_info['ex']}')." + ) + + # Convert back to float + x_bin = float(match_obj.group(1)) + return x_bin + + +def _setup_and_validate( + ds_Sv: xr.Dataset, + range_var: Literal["echo_range", "depth"] = "echo_range", + range_bin: Optional[str] = None, + closed: Literal["left", "right"] = "left", + required_data_vars: Optional[list] = None, +) -> Tuple[xr.Dataset, float]: + """ + Setup and validate shared arguments for + ``compute_X`` functions. 
+ + For now this is only used by ``compute_MVBS`` and ``compute_NASC``. + + Parameters + ---------- + ds_Sv : xr.Dataset + Sv dataset + range_var : {'echo_range', 'depth'}, default 'echo_range' + The variable to use for range binning. + Must be one of ``echo_range`` or ``depth``. + Note that ``depth`` is only available if the input dataset contains + ``depth`` as a data variable. + range_bin : str, default None + bin size along ``echo_range`` or ``depth`` in meters. + closed: {'left', 'right'}, default 'left' + Which side of bin interval is closed. + required_data_vars : list, optional + List of required data variables in ds_Sv. + If None, defaults to empty list. + + Returns + ------- + ds_Sv : xr.Dataset + Modified Sv dataset + range_bin : float + The range bin value in meters + """ + + # Check if range_var is valid + if range_var not in ["echo_range", "depth"]: + raise ValueError("range_var must be one of 'echo_range' or 'depth'.") + + # Set to default empty list if None + if required_data_vars is None: + required_data_vars = [] + + # Check if required data variables exists in ds_Sv + # Use set to ensure no duplicates + required_data_vars = set(required_data_vars + [range_var]) + if not all([var in ds_Sv.variables for var in required_data_vars]): + raise ValueError( + "Input Sv dataset must contain all of " f"the following variables: {required_data_vars}" + ) + + # Check if range_bin is a string + if not isinstance(range_bin, str): + raise TypeError("range_bin must be a string") + + # Parse the range_bin string and convert to float + range_bin = _parse_x_bin(range_bin, "range_bin") + + # Check for closed values + if closed not in ["right", "left"]: + raise ValueError(f"{closed} is not a valid option. Options are 'left' or 'right'.") + + # Clean up filenames dimension if it exists + # not needed here + if "filenames" in ds_Sv.dims: + ds_Sv = ds_Sv.drop_dims("filenames") + + return ds_Sv, range_bin + + +def _get_reduced_positions( + ds_Sv: xr.Dataset, + ds_X: xr.Dataset, + X: Literal["MVBS", "NASC"], + x_interval: Union[pd.IntervalIndex, np.ndarray], +) -> xr.Dataset: + """Helper function to get reduced positions + + Parameters + ---------- + ds_Sv : xr.Dataset + The input Sv dataset + ds_X : xr.Dataset + The input X dataset, either ``ds_MVBS`` or ``ds_NASC`` + X : {'MVBS', 'NASC'} + The type of X dataset + x_interval : pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for X dataset. 
+ + MVBS: ``ping_time`` + NASC: ``distance_nmi`` + + Returns + ------- + xr.Dataset + The X dataset with reduced position variables + such as latitude and longitude + """ + # Get positions if they exist + # otherwise return the input ds_X + if all(v in ds_Sv for v in POSITION_VARIABLES): + x_var = x_dim = "ping_time" + if X == "NASC": + x_var = "distance_nmi" + x_dim = "distance" + + ds_Pos = xarray_reduce( + ds_Sv[POSITION_VARIABLES], + ds_Sv[x_var], + func="nanmean", + expected_groups=(x_interval), + isbin=True, + method="map-reduce", + ) + + for var in POSITION_VARIABLES: + ds_X[var] = ([x_dim], ds_Pos[var].data, ds_Sv[var].attrs) + return ds_X + + +def _groupby_x_along_channels( + ds_Sv: xr.Dataset, + range_interval: Union[pd.IntervalIndex, np.ndarray], + x_interval: Union[pd.IntervalIndex, np.ndarray], + x_var: Literal["ping_time", "distance_nmi"] = "ping_time", + range_var: Literal["echo_range", "depth"] = "echo_range", + method: str = "map-reduce", + **flox_kwargs, +) -> xr.Dataset: + """ + Perform groupby of ``ds_Sv`` along each channel for the given + intervals to get the 'sv' mean value. + + Parameters + ---------- + ds_Sv : xr.Dataset + A Dataset containing ``Sv`` and other variables, + depending on computation performed. + + For MVBS computation, this must contain ``Sv`` and ``echo_range`` data + with coordinates ``channel``, ``ping_time``, and ``range_sample`` + at bare minimum. + Or this can contain ``Sv`` and ``depth`` data with similar coordinates. + + For NASC computation, this must contain ``Sv`` and ``depth`` data + with coordinates ``channel``, ``distance_nmi``, and ``range_sample``. + range_interval: pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for ``range_var`` + x_interval : pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for ``ping_time`` or ``distance_nmi``. + x_var : {'ping_time', 'distance_nmi'}, default 'ping_time' + The variable to use for x binning. This will determine + if computation is for MVBS or NASC. + range_var: {'echo_range', 'depth'}, default 'echo_range' + The variable to use for range binning. + Either ``echo_range`` or ``depth``. + + **For NASC, this must be ``depth``.** + method: str + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. + **flox_kwargs + Additional keyword arguments to be passed + to flox reduction function.
+ + Returns + ------- + xr.Dataset + The MVBS or NASC dataset of the input ``ds_Sv`` for all channels + """ + # Check if x_var is valid, currently only support + # ping_time and distance_nmi, which indicates + # either a MVBS or NASC computation + if x_var not in ["ping_time", "distance_nmi"]: + raise ValueError("x_var must be 'ping_time' or 'distance_nmi'") + + # Set correct range_var just in case + if x_var == "distance_nmi" and range_var != "depth": + logger.warning("x_var is 'distance_nmi', setting range_var to 'depth'") + range_var = "depth" + + # average should be done in linear domain + sv = ds_Sv["Sv"].pipe(_log2lin) + + # reduce along ping_time or distance_nmi + # and echo_range or depth + # by binning and averaging + sv_mean = xarray_reduce( + sv, + ds_Sv["channel"], + ds_Sv[x_var], + ds_Sv[range_var], + func="nanmean", + expected_groups=(None, x_interval, range_interval), + isbin=[False, True, True], + method=method, + **flox_kwargs, + ) + return sv_mean diff --git a/echopype/consolidate/api.py b/echopype/consolidate/api.py index a08144a5c..364c76be3 100644 --- a/echopype/consolidate/api.py +++ b/echopype/consolidate/api.py @@ -12,6 +12,8 @@ from ..utils.prov import add_processing_level from .split_beam_angle import add_angle_to_ds, get_angle_complex_samples, get_angle_power_samples +POSITION_VARIABLES = ["latitude", "longitude"] + def swap_dims_channel_frequency(ds: xr.Dataset) -> xr.Dataset: """ @@ -185,7 +187,7 @@ def sel_interp(var, time_dim_name): f"{datetime.datetime.utcnow()} +00:00. " "Interpolated or propagated from Platform latitude/longitude." # noqa ) - for da_name in ["latitude", "longitude"]: + for da_name in POSITION_VARIABLES: interp_ds[da_name] = interp_ds[da_name].assign_attrs({"history": history_attr}) if time_dim_name in interp_ds: diff --git a/echopype/convert/api.py b/echopype/convert/api.py index fbc9e8766..107af1bc0 100644 --- a/echopype/convert/api.py +++ b/echopype/convert/api.py @@ -17,6 +17,7 @@ from ..utils.coding import COMPRESSION_SETTINGS from ..utils.log import _init_logger from ..utils.prov import add_processing_level +from .utils.ek import should_use_swap BEAM_SUBGROUP_DEFAULT = "Beam_group1" @@ -315,6 +316,8 @@ def open_raw( convert_params: Optional[Dict[str, str]] = None, storage_options: Optional[Dict[str, str]] = None, use_swap: bool = False, + destination_path: Optional[str] = None, + destination_storage_options: Optional[Dict[str, str]] = None, max_mb: int = 100, ) -> Optional[EchoData]: """Create an EchoData object containing parsed data from a single raw data file. @@ -342,11 +345,21 @@ def open_raw( convert_params : dict parameters (metadata) that may not exist in the raw file and need to be added to the converted file - storage_options : dict + storage_options : dict, optional options for cloud storage use_swap: bool + **DEPRECATED: This flag is ignored** If True, variables with a large memory footprint will be written to a temporary zarr store at ``~/.echopype/temp_output/parsed2zarr_temp_files`` + destination_path: str + The path to a swap directory in a case of a large memory footprint. + This path can be a remote path like s3://bucket/swap_dir. + By default, it will create a temporary zarr store at + ``~/.echopype/temp_output/parsed2zarr_temp_files`` if needed, + when set to "auto". + destination_storage_options: dict, optional + Options for remote storage for the swap directory ``destination_path`` + argument. 
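Putting the binning pieces together, here is a minimal sketch of the reduction that ``_groupby_x_along_channels`` performs, run on a tiny synthetic dataset and assuming ``_log2lin`` is the usual 10^(Sv/10) conversion (the dataset, bin sizes, and channel names are all made up):

    import numpy as np
    import pandas as pd
    import xarray as xr
    from flox.xarray import xarray_reduce

    # Tiny synthetic Sv dataset: 2 channels x 4 pings x 6 samples (hypothetical values)
    ping_time = pd.date_range("2023-01-01", periods=4, freq="1s")
    ds_Sv = xr.Dataset(
        {
            "Sv": (("channel", "ping_time", "range_sample"),
                   np.random.uniform(-80, -30, (2, 4, 6))),
            "echo_range": (("channel", "ping_time", "range_sample"),
                           np.tile(np.linspace(0, 45, 6), (2, 4, 1))),
        },
        coords={"channel": ["ch1", "ch2"], "ping_time": ping_time,
                "range_sample": np.arange(6)},
    )

    # Left-closed bins: 10 m along echo_range, 2 s along ping_time
    range_interval = pd.IntervalIndex.from_breaks(np.arange(0, 60, 10), closed="left")
    ping_interval = pd.IntervalIndex.from_breaks(
        pd.date_range("2023-01-01", periods=3, freq="2s"), closed="left"
    )

    # Average in the linear domain (assumes _log2lin == 10 ** (Sv / 10)), then back to dB
    sv_linear = 10 ** (ds_Sv["Sv"] / 10)
    sv_mean = xarray_reduce(
        sv_linear,
        ds_Sv["channel"],
        ds_Sv["ping_time"],
        ds_Sv["echo_range"],
        func="nanmean",
        expected_groups=(None, ping_interval, range_interval),
        isbin=[False, True, True],
        method="map-reduce",
    )
    mvbs = 10 * np.log10(sv_mean)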
max_mb : int The maximum data chunk size in Megabytes (MB), when offloading variables with a large memory footprint to a temporary zarr store @@ -369,10 +382,23 @@ def open_raw( Notes ----- - ``use_swap=True`` is only available for the following + In a case of a large memory footprint, the program will determine if using + a temporary swap space is needed. If so, it will use that space + during conversion to prevent out of memory errors. + Users can override this behaviour by either passing ``"swap"`` or ``"no_swap"`` + into the ``destination_path`` argument. + This feature is only available for the following echosounders: EK60, ES70, EK80, ES80, EA640. Additionally, this feature is currently in beta. """ + + # Initially set use_swap False + use_swap = False + + # Set initial destination_path of "no_swap" + if destination_path is None: + destination_path = "no_swap" + if raw_file is None: raise FileNotFoundError("The path to the raw data file must be specified.") @@ -402,15 +428,6 @@ def open_raw( # Check file extension and existence file_chk, xml_chk = _check_file(raw_file, sonar_model, xml_path, storage_options) - # TODO: remove once 'auto' option is added - if not isinstance(use_swap, bool): - raise ValueError("use_swap must be of type bool.") - - # Ensure use_swap is 'auto', if it is a string - # TODO: use the following when we allow for 'auto' option - # if isinstance(use_swap, str) and use_swap != "auto": - # raise ValueError("use_swap must be a bool or equal to 'auto'.") - # TODO: the if-else below only works for the AZFP vs EK contrast, # but is brittle since it is abusing params by using it implicitly if SONAR_MODELS[sonar_model]["xml"]: @@ -423,29 +440,49 @@ def open_raw( # Parse raw file and organize data into groups parser = SONAR_MODELS[sonar_model]["parser"]( - file_chk, params=params, storage_options=storage_options, dgram_zarr_vars=dgram_zarr_vars + file_chk, + params=params, + storage_options=storage_options, + dgram_zarr_vars=dgram_zarr_vars, + sonar_model=sonar_model, ) parser.parse_raw() # Direct offload to zarr and rectangularization only available for some sonar models if sonar_model in ["EK60", "ES70", "EK80", "ES80", "EA640"]: - # Create sonar_model-specific p2z object - p2z = SONAR_MODELS[sonar_model]["parsed2zarr"](parser) - - # Determines if writing to zarr is necessary and writes to zarr - p2z_flag = use_swap is True or ( - use_swap == "auto" and p2z.whether_write_to_zarr(mem_mult=0.4) - ) - - if p2z_flag: - p2z.datagram_to_zarr(max_mb=max_mb) - # Rectangularize the transmit data - parser.rectangularize_transmit_ping_data(data_type="complex") + swap_map = { + "swap": True, + "no_swap": False, + } + if destination_path == "auto": + # Overwrite use_swap if it's True below + # Use local swap directory + use_swap = should_use_swap(parser.zarr_datagrams, dgram_zarr_vars, mem_mult=0.4) + elif destination_path in swap_map: + use_swap = swap_map[destination_path] + else: + # TODO: Add docstring about swap path + use_swap = True + if "://" in destination_path and destination_storage_options is None: + raise ValueError( + ( + "Please provide storage options for remote destination. 
", + "If access is already configured locally, ", + "simply put an empty dictionary.", + ) + ) + + if use_swap: + # Create sonar_model-specific p2z object + p2z = SONAR_MODELS[sonar_model]["parsed2zarr"](parser) + p2z.datagram_to_zarr( + dest_path=destination_path, + dest_storage_options=destination_storage_options, + max_mb=max_mb, + ) else: - del p2z - # Create general p2z object - p2z = Parsed2Zarr(parser) + p2z = Parsed2Zarr(parser) # Create general p2z object parser.rectangularize_data() else: diff --git a/echopype/convert/parse_ad2cp.py b/echopype/convert/parse_ad2cp.py index 3b5fcea70..6c5c796fd 100644 --- a/echopype/convert/parse_ad2cp.py +++ b/echopype/convert/parse_ad2cp.py @@ -219,8 +219,8 @@ class NoMorePackets(Exception): class ParseAd2cp(ParseBase): - def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}): - super().__init__(file, storage_options) + def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}, sonar_model="AD2CP"): + super().__init__(file, storage_options, sonar_model) self.config = None self.packets: List[Ad2cpDataPacket] = [] diff --git a/echopype/convert/parse_azfp.py b/echopype/convert/parse_azfp.py index a890068a7..ae9f928e2 100644 --- a/echopype/convert/parse_azfp.py +++ b/echopype/convert/parse_azfp.py @@ -1,5 +1,5 @@ import os -import xml.dom.minidom +import xml.etree.ElementTree as ET from collections import defaultdict from datetime import datetime as dt from struct import unpack @@ -8,48 +8,11 @@ import numpy as np from ..utils.log import _init_logger +from ..utils.misc import camelcase2snakecase from .parse_base import ParseBase FILENAME_DATETIME_AZFP = "\\w+.01A" -XML_INT_PARAMS = { - "NumFreq": "num_freq", - "SerialNumber": "serial_number", - "BurstInterval": "burst_interval", - "PingsPerBurst": "pings_per_burst", - "AverageBurstPings": "average_burst_pings", - "SensorsFlag": "sensors_flag", -} -XML_FLOAT_PARAMS = [ - # Temperature coeffs - "ka", - "kb", - "kc", - "A", - "B", - "C", - # Tilt coeffs - "X_a", - "X_b", - "X_c", - "X_d", - "Y_a", - "Y_b", - "Y_c", - "Y_d", -] -XML_FREQ_PARAMS = { - "RangeSamples": "range_samples", - "RangeAveragingSamples": "range_averaging_samples", - "DigRate": "dig_rate", - "LockOutIndex": "lockout_index", - "Gain": "gain", - "PulseLen": "pulse_length", - "DS": "DS", - "EL": "EL", - "TVR": "TVR", - "VTX0": "VTX", - "BP": "BP", -} + HEADER_FIELDS = ( ("profile_flag", "u2"), ("profile_number", "u2"), @@ -64,7 +27,7 @@ ("second", "u2"), # Second ("hundredths", "u2"), # Hundredths of a second ("dig_rate", "u2", 4), # Digitalization rate for each channel - ("lockout_index", "u2", 4), # Lockout index for each channel + ("lock_out_index", "u2", 4), # Lockout index for each channel ("num_bins", "u2", 4), # Number of bins for each channel ( "range_samples_per_bin", @@ -88,7 +51,7 @@ ("num_chan", "u1"), # 1, 2, 3, or 4 ("gain", "u1", 4), # gain channel 1-4 ("spare_chan", "u1"), # spare channel - ("pulse_length", "u2", 4), # Pulse length chan 1-4 uS + ("pulse_len", "u2", 4), # Pulse length chan 1-4 uS ("board_num", "u2", 4), # The board the data came from channel 1-4 ("frequency", "u2", 4), # frequency for channel 1-4 in kHz ( @@ -110,42 +73,58 @@ class ParseAZFP(ParseBase): HEADER_FORMAT = ">HHHHIHHHHHHHHHHHHHHHHHHHHHHHHHHHHHBBBBHBBBBBBBBHHHHHHHHHHHHHHHHHHHH" FILE_TYPE = 64770 - def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}): - super().__init__(file, storage_options) + def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}, sonar_model="AZFP"): + 
super().__init__(file, storage_options, sonar_model) # Parent class attributes # regex pattern used to grab datetime embedded in filename self.timestamp_pattern = FILENAME_DATETIME_AZFP self.xml_path = params # Class attributes - self.parameters = dict() + self.parameters = defaultdict(list) self.unpacked_data = defaultdict(list) self.sonar_type = "AZFP" def load_AZFP_xml(self): - """Parse XML file to get params for reading AZFP data.""" - """Parses the AZFP XML file. """ - - def get_value_by_tag_name(tag_name, element=0): - """Returns the value in an XML tag given the tag name and the number of occurrences.""" - return px.getElementsByTagName(tag_name)[element].childNodes[0].data + Parses the AZFP XML file. + """ xmlmap = fsspec.get_mapper(self.xml_path, **self.storage_options) - px = xml.dom.minidom.parse(xmlmap.fs.open(xmlmap.root)) - - # Retrieve integer parameters from the xml file - for old_name, new_name in XML_INT_PARAMS.items(): - self.parameters[new_name] = int(get_value_by_tag_name(old_name)) - # Retrieve floating point parameters from the xml file - for param in XML_FLOAT_PARAMS: - self.parameters[param] = float(get_value_by_tag_name(param)) - # Retrieve frequency dependent parameters from the xml file - for old_name, new_name in XML_FREQ_PARAMS.items(): - self.parameters[new_name] = [ - float(get_value_by_tag_name(old_name, ch)) - for ch in range(self.parameters["num_freq"]) - ] + phase_number = None + for event, child in ET.iterparse(xmlmap.fs.open(xmlmap.root), events=("start", "end")): + if event == "end" and child.tag == "Phases": + phase_number = None + if event == "start": + if len(child.tag) > 3 and not child.tag.startswith("VTX"): + camel_case_tag = camelcase2snakecase(child.tag) + else: + camel_case_tag = child.tag + + if len(child.attrib) > 0: + for key, val in child.attrib.items(): + attrib_tag = camel_case_tag + "_" + camelcase2snakecase(key) + if phase_number is not None and camel_case_tag != "phase": + attrib_tag += f"_phase{phase_number}" + self.parameters[attrib_tag].append(val) + if child.tag == "Phase": + phase_number = val + + if all(char == "\n" for char in child.text): + continue + else: + try: + val = int(child.text) + except ValueError: + val = float(child.text) + if phase_number is not None and camel_case_tag != "phase": + camel_case_tag += f"_phase{phase_number}" + self.parameters[camel_case_tag].append(val) + + # Handling the case where there is only one value for each parameter + for key, val in self.parameters.items(): + if len(val) == 1: + self.parameters[key] = val[0] def _compute_temperature(self, ping_num, is_valid): """ @@ -218,6 +197,25 @@ def _compute_battery(self, ping_num, battery_type): return N * USL5_BAT_CONSTANT + def _compute_pressure(self, ping_num, is_valid): + """ + Compute pressure in decibar + + Parameters + ---------- + ping_num + ping number + is_valid + whether the associated parameters have valid values + """ + if not is_valid or self.parameters["sensors_flag_pressure_sensor_installed"] == "no": + return np.nan + + counts = self.unpacked_data["ancillary"][ping_num][3] + v_in = 2.5 * (counts / 65535) + P = v_in * self.parameters["a1"] + self.parameters["a0"] - 10.125 + return P + def parse_raw(self): """ Parse raw data file from AZFP echosounder. 
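For reference, the new ``_compute_pressure`` conversion above amounts to the following small calculation (the ``a0``/``a1`` coefficients and the raw counts are hypothetical; real values come from the instrument XML file):

    # Worked example of the counts -> decibar conversion used in _compute_pressure
    a0, a1 = 1.0, 100.0        # hypothetical calibration coefficients
    counts = 30000             # hypothetical raw ancillary counts for one ping
    v_in = 2.5 * (counts / 65535)          # counts -> volts
    pressure = v_in * a1 + a0 - 10.125     # volts -> decibar
    print(round(pressure, 3))  # ~105.318 dbar with these made-up numbers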
@@ -235,6 +233,7 @@ def _test_valid_params(params): return True temperature_is_valid = _test_valid_params(["ka", "kb", "kc"]) + pressure_is_valid = _test_valid_params(["a0", "a1"]) tilt_x_is_valid = _test_valid_params(["X_a", "X_b", "X_c"]) tilt_y_is_valid = _test_valid_params(["Y_a", "Y_b", "Y_c"]) @@ -245,7 +244,6 @@ def _test_valid_params(params): header_chunk = file.read(self.HEADER_SIZE) if header_chunk: header_unpacked = unpack(self.HEADER_FORMAT, header_chunk) - # Reading will stop if the file contains an unexpected flag if self._split_header(file, header_unpacked): # Appends the actual 'data values' to unpacked_data @@ -257,6 +255,10 @@ def _test_valid_params(params): self.unpacked_data["temperature"].append( self._compute_temperature(ping_num, temperature_is_valid) ) + # Compute pressure from unpacked_data[ii]['ancillary'][3] + self.unpacked_data["pressure"].append( + self._compute_pressure(ping_num, pressure_is_valid) + ) # compute x tilt from unpacked_data[ii]['ancillary][0] self.unpacked_data["tilt_x"].append( self._compute_tilt(ping_num, "X", tilt_x_is_valid) @@ -354,12 +356,12 @@ def _split_header(self, raw, header_unpacked): field_w_freq = ( "dig_rate", - "lockout_index", + "lock_out_index", "num_bins", "range_samples_per_bin", # fields with num_freq data "data_type", "gain", - "pulse_length", + "pulse_len", "board_num", "frequency", ) @@ -417,12 +419,12 @@ def _check_uniqueness(self): # fields with num_freq data field_w_freq = ( "dig_rate", - "lockout_index", + "lock_out_index", "num_bins", "range_samples_per_bin", "data_type", "gain", - "pulse_length", + "pulse_len", "board_num", "frequency", ) @@ -478,22 +480,22 @@ def _get_ping_time(self): self.ping_time = ping_time @staticmethod - def _calc_Sv_offset(f, pulse_length): + def _calc_Sv_offset(f, pulse_len): """Calculate the compensation factor for Sv calculation.""" # TODO: this method seems should be in echopype.process if f > 38000: - if pulse_length == 300: + if pulse_len == 300: return 1.1 - elif pulse_length == 500: + elif pulse_len == 500: return 0.8 - elif pulse_length == 700: + elif pulse_len == 700: return 0.5 - elif pulse_length == 900: + elif pulse_len == 900: return 0.3 - elif pulse_length == 1000: + elif pulse_len == 1000: return 0.3 else: - if pulse_length == 500: + if pulse_len == 500: return 1.1 - elif pulse_length == 1000: + elif pulse_len == 1000: return 0.7 diff --git a/echopype/convert/parse_base.py b/echopype/convert/parse_base.py index e14ae276a..3f4d960e4 100644 --- a/echopype/convert/parse_base.py +++ b/echopype/convert/parse_base.py @@ -17,12 +17,14 @@ class ParseBase: """Parent class for all convert classes.""" - def __init__(self, file, storage_options): + def __init__(self, file, storage_options, sonar_model): self.source_file = file self.timestamp_pattern = None # regex pattern used to grab datetime embedded in filename self.ping_time = [] # list to store ping time self.storage_options = storage_options self.zarr_datagrams = [] # holds all parsed datagrams + self.zarr_tx_datagrams = [] # holds all parsed transmit datagrams + self.sonar_model = sonar_model def _print_status(self): """Prints message to console giving information about the raw file being parsed.""" @@ -31,8 +33,8 @@ def _print_status(self): class ParseEK(ParseBase): """Class for converting data from Simrad echosounders.""" - def __init__(self, file, params, storage_options, dgram_zarr_vars): - super().__init__(file, storage_options) + def __init__(self, file, params, storage_options, dgram_zarr_vars, sonar_model): + 
super().__init__(file, storage_options, sonar_model) # Parent class attributes # regex pattern used to grab datetime embedded in filename @@ -78,6 +80,10 @@ def rectangularize_data(self): for dgram in self.zarr_datagrams: self._append_channel_ping_data(dgram, zarr_vars=False) + # append zarr transmit datagrams to channel ping data + for dgram in self.zarr_tx_datagrams: + self._append_channel_ping_data(dgram, rx=False, zarr_vars=False) + # Rectangularize all data and convert to numpy array indexed by channel for data_type in ["power", "angle", "complex"]: # Receive data @@ -350,7 +356,7 @@ def _read_datagrams(self, fid): else: logger.info("Unknown datagram type: " + str(new_datagram["type"])) - def _append_zarr_dgram(self, full_dgram: dict): + def _append_zarr_dgram(self, full_dgram: dict, rx: bool): """ Selects a subset of the datagram values that need to be sent directly to a zarr file and @@ -389,7 +395,10 @@ def _append_zarr_dgram(self, full_dgram: dict): reduced_datagram["power"] = reduced_datagram["power"].astype("float32") * INDEX2POWER if reduced_datagram: - self.zarr_datagrams.append(reduced_datagram) + if rx: + self.zarr_datagrams.append(reduced_datagram) + else: + self.zarr_tx_datagrams.append(reduced_datagram) def _append_channel_ping_data(self, datagram, rx=True, zarr_vars=True): """ @@ -411,10 +420,10 @@ def _append_channel_ping_data(self, datagram, rx=True, zarr_vars=True): ch_id = datagram["channel_id"] if "channel_id" in datagram else datagram["channel"] # append zarr variables, if they exist - if zarr_vars and rx: + if zarr_vars: common_vars = set(self.dgram_zarr_vars.keys()).intersection(set(datagram.keys())) if common_vars: - self._append_zarr_dgram(datagram) + self._append_zarr_dgram(datagram, rx=rx) for var in common_vars: del datagram[var] diff --git a/echopype/convert/parse_ek60.py b/echopype/convert/parse_ek60.py index 9502c3214..a8666a386 100644 --- a/echopype/convert/parse_ek60.py +++ b/echopype/convert/parse_ek60.py @@ -4,8 +4,8 @@ class ParseEK60(ParseEK): """Class for converting data from Simrad EK60 echosounders.""" - def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}): - super().__init__(file, params, storage_options, dgram_zarr_vars) + def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}, sonar_model="EK60"): + super().__init__(file, params, storage_options, dgram_zarr_vars, sonar_model) def _select_datagrams(self, params): # Translates user input into specific datagrams or ALL diff --git a/echopype/convert/parse_ek80.py b/echopype/convert/parse_ek80.py index 55027ffe3..583fac8c4 100644 --- a/echopype/convert/parse_ek80.py +++ b/echopype/convert/parse_ek80.py @@ -4,8 +4,8 @@ class ParseEK80(ParseEK): """Class for converting data from Simrad EK80 echosounders.""" - def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}): - super().__init__(file, params, storage_options, dgram_zarr_vars) + def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}, sonar_model="EK80"): + super().__init__(file, params, storage_options, dgram_zarr_vars, sonar_model) self.environment = {} # dictionary to store environment data def _select_datagrams(self, params): diff --git a/echopype/convert/parsed_to_zarr.py b/echopype/convert/parsed_to_zarr.py index 95fde46c9..9e8a302a9 100644 --- a/echopype/convert/parsed_to_zarr.py +++ b/echopype/convert/parsed_to_zarr.py @@ -1,14 +1,28 @@ import secrets import sys -from pathlib import Path -from typing import List, Tuple, Union +from typing import Dict, List, Optional, 
Tuple, Union -import more_itertools as miter +import dask.array +import fsspec import numpy as np import pandas as pd +import xarray as xr import zarr -from ..utils.io import ECHOPYPE_DIR, check_file_permissions +from ..echodata.convention import sonarnetcdf_1 +from ..utils.io import ECHOPYPE_DIR, check_file_permissions, validate_output_path + +DEFAULT_ZARR_TEMP_DIR = ECHOPYPE_DIR / "temp_output" / "parsed2zarr_temp_files" + + +def _create_zarr_store_map(path, storage_options): + file_path = validate_output_path( + source_file=secrets.token_hex(16), + engine="zarr", + save_path=path, + output_storage_options=storage_options, + ) + return fsspec.get_mapper(file_path, **storage_options) class Parsed2Zarr: @@ -26,53 +40,63 @@ def __init__(self, parser_obj): self.zarr_root = None self.parser_obj = parser_obj # parser object ParseEK60/ParseEK80/etc. - def _create_zarr_info(self): + self._varattrs = sonarnetcdf_1.yaml_dict["variable_and_varattributes"] + + if hasattr(self.parser_obj, "sonar_model"): + self.sonar_model = self.parser_obj.sonar_model + + def _create_zarr_info( + self, dest_path: str = None, dest_storage_options: Optional[Dict] = None, retries: int = 10 + ): """ Creates the temporary directory for zarr storage, zarr file name, zarr store, and the root group of the zarr store. """ - # get current working directory - current_dir = Path.cwd() - - # Check permission of cwd, raise exception if no permission - check_file_permissions(current_dir) + if dest_path is None: + # Check permission of cwd, raise exception if no permission + check_file_permissions(ECHOPYPE_DIR) - # construct temporary directory that will hold the zarr file - out_dir = current_dir / ECHOPYPE_DIR / "temp_output" / "parsed2zarr_temp_files" - if not out_dir.exists(): - out_dir.mkdir(parents=True) + # construct temporary directory that will hold the zarr file + dest_path = DEFAULT_ZARR_TEMP_DIR + if not dest_path.exists(): + dest_path.mkdir(parents=True) # establish temporary directory we will write zarr files to - self.temp_zarr_dir = str(out_dir) - - # create zarr store name - zarr_file_name = str(out_dir / secrets.token_hex(16)) + ".zarr" - - # attempt to find different zarr_file_name, if it already exists - count = 0 - while Path(zarr_file_name).exists() and count < 10: - # generate new zarr_file_name - zarr_file_name = str(out_dir / secrets.token_hex(16)) + ".zarr" - count += 1 - - # error out if we are unable to get a unique name, else assign name to class variable - if (count == 10) and Path(zarr_file_name).exists(): - raise RuntimeError("Unable to construct an unused zarr file name for Parsed2Zarr!") - else: - self.zarr_file_name = zarr_file_name + self.temp_zarr_dir = str(dest_path) + + # Set default storage options if None + if dest_storage_options is None: + dest_storage_options = {} + + # attempt to find different zarr_file_name + attempt = 0 + exists = True + while exists: + zarr_store = _create_zarr_store_map( + path=dest_path, storage_options=dest_storage_options + ) + exists = zarr_store.fs.exists(zarr_store.root) + attempt += 1 + + if attempt == retries and exists: + raise RuntimeError( + ( + "Unable to construct an unused zarr file name for Parsed2Zarr ", + f"after {retries} retries!", + ) + ) # create zarr store and zarr group we want to write to - self.store = zarr.DirectoryStore(self.zarr_file_name) + self.store = zarr_store self.zarr_root = zarr.group(store=self.store, overwrite=True) def _close_store(self): """properly closes zarr store""" - # consolidate metadata and close zarr store + # 
consolidate metadata zarr.consolidate_metadata(self.store) - self.store.close() @staticmethod def set_multi_index( @@ -383,7 +407,7 @@ def write_df_column( # evenly chunk unique times so that the smallest and largest # chunk differ by at most 1 element - chunks = list(miter.chunked_even(unique_time_ind, max_num_times)) + chunks = np.array_split(unique_time_ind, np.ceil(len(unique_time_ind) / max_num_times)) self.write_chunks(pd_series, zarr_grp, is_array, chunks, chunk_shape) @@ -399,6 +423,225 @@ def _get_zarr_dgrams_size(self) -> int: return size + def _get_channel_ids(self, chan_str: np.ndarray) -> List[str]: + """ + Obtains the channel IDs associated with ``chan_str``. + + Parameters + ---------- + chan_str : np.ndarray + A numpy array of strings corresponding to the + keys of ``config_datagram["transceivers"]`` + + Returns + ------- + A list of strings representing the channel IDS + """ + if self.sonar_model in ["EK60", "ES70"]: + return [ + self.parser_obj.config_datagram["transceivers"][int(i)]["channel_id"] + for i in chan_str + ] + else: + return [ + self.parser_obj.config_datagram["configuration"][i]["channel_id"] for i in chan_str + ] + + @property + def power_dataarray(self) -> xr.DataArray: + """ + Constructs a DataArray from a Dask array for the power + data. + + Returns + ------- + DataArray named "backscatter_r" representing the + power data. + """ + zarr_path = self.store + + # collect variables associated with the power data + power = dask.array.from_zarr(zarr_path, component="power/power") + + pow_time_path = "power/" + self.power_dims[0] + pow_chan_path = "power/" + self.power_dims[1] + power_time = dask.array.from_zarr(zarr_path, component=pow_time_path).compute() + power_channel = dask.array.from_zarr(zarr_path, component=pow_chan_path).compute() + + # obtain channel names for power data + pow_chan_names = self._get_channel_ids(power_channel) + + backscatter_r = xr.DataArray( + data=power, + coords={ + "ping_time": ( + ["ping_time"], + power_time, + self._varattrs["beam_coord_default"]["ping_time"], + ), + "channel": ( + ["channel"], + pow_chan_names, + self._varattrs["beam_coord_default"]["channel"], + ), + "range_sample": ( + ["range_sample"], + np.arange(power.shape[2]), + self._varattrs["beam_coord_default"]["range_sample"], + ), + }, + name="backscatter_r", + attrs={ + "long_name": self._varattrs["beam_var_default"]["backscatter_r"]["long_name"], + "units": "dB", + }, + ) + + return backscatter_r + + @property + def angle_dataarrays(self) -> Tuple[xr.DataArray, xr.DataArray]: + """ + Constructs the DataArrays from Dask arrays associated + with the angle data. + + Returns + ------- + DataArrays named "angle_athwartship" and "angle_alongship", + respectively, representing the angle data. 
+ """ + + zarr_path = self.store + + # collect variables associated with the angle data + angle_along = dask.array.from_zarr(zarr_path, component="angle/angle_alongship") + angle_athwart = dask.array.from_zarr(zarr_path, component="angle/angle_athwartship") + + ang_time_path = "angle/" + self.angle_dims[0] + ang_chan_path = "angle/" + self.angle_dims[1] + angle_time = dask.array.from_zarr(zarr_path, component=ang_time_path).compute() + angle_channel = dask.array.from_zarr(zarr_path, component=ang_chan_path).compute() + + # obtain channel names for angle data + ang_chan_names = self._get_channel_ids(angle_channel) + + array_coords = { + "ping_time": ( + ["ping_time"], + angle_time, + self._varattrs["beam_coord_default"]["ping_time"], + ), + "channel": ( + ["channel"], + ang_chan_names, + self._varattrs["beam_coord_default"]["channel"], + ), + "range_sample": ( + ["range_sample"], + np.arange(angle_athwart.shape[2]), + self._varattrs["beam_coord_default"]["range_sample"], + ), + } + + angle_athwartship = xr.DataArray( + data=angle_athwart, + coords=array_coords, + name="angle_athwartship", + attrs={ + "long_name": "electrical athwartship angle", + "comment": ( + "Introduced in echopype for Simrad echosounders. " # noqa + + "The athwartship angle corresponds to the major angle in SONAR-netCDF4 vers 2. " # noqa + ), + }, + ) + + angle_alongship = xr.DataArray( + data=angle_along, + coords=array_coords, + name="angle_alongship", + attrs={ + "long_name": "electrical alongship angle", + "comment": ( + "Introduced in echopype for Simrad echosounders. " # noqa + + "The alongship angle corresponds to the minor angle in SONAR-netCDF4 vers 2. " # noqa + ), + }, + ) + + return angle_athwartship, angle_alongship + + @property + def complex_dataarrays(self) -> Tuple[xr.DataArray, xr.DataArray]: + """ + Constructs the DataArrays from Dask arrays associated + with the complex data. + + Returns + ------- + DataArrays named "backscatter_r" and "backscatter_i", + respectively, representing the complex data. 
+ """ + + zarr_path = self.store + + # collect variables associated with the complex data + complex_r = dask.array.from_zarr(zarr_path, component="complex/backscatter_r") + complex_i = dask.array.from_zarr(zarr_path, component="complex/backscatter_i") + + comp_time_path = "complex/" + self.complex_dims[0] + comp_chan_path = "complex/" + self.complex_dims[1] + complex_time = dask.array.from_zarr(zarr_path, component=comp_time_path).compute() + complex_channel = dask.array.from_zarr(zarr_path, component=comp_chan_path).compute() + + # obtain channel names for complex data + comp_chan_names = self._get_channel_ids(complex_channel) + + array_coords = { + "ping_time": ( + ["ping_time"], + complex_time, + self._varattrs["beam_coord_default"]["ping_time"], + ), + "channel": ( + ["channel"], + comp_chan_names, + self._varattrs["beam_coord_default"]["channel"], + ), + "range_sample": ( + ["range_sample"], + np.arange(complex_r.shape[2]), + self._varattrs["beam_coord_default"]["range_sample"], + ), + "beam": ( + ["beam"], + np.arange(start=1, stop=complex_r.shape[3] + 1).astype(str), + self._varattrs["beam_coord_default"]["beam"], + ), + } + + backscatter_r = xr.DataArray( + data=complex_r, + coords=array_coords, + name="backscatter_r", + attrs={ + "long_name": self._varattrs["beam_var_default"]["backscatter_r"]["long_name"], + "units": "dB", + }, + ) + + backscatter_i = xr.DataArray( + data=complex_i, + coords=array_coords, + name="backscatter_i", + attrs={ + "long_name": self._varattrs["beam_var_default"]["backscatter_i"]["long_name"], + "units": "dB", + }, + ) + + return backscatter_r, backscatter_i + def array_series_bytes(self, pd_series: pd.Series, n_rows: int) -> int: """ Determines the amount of bytes required for a diff --git a/echopype/convert/parsed_to_zarr_ek60.py b/echopype/convert/parsed_to_zarr_ek60.py index 17f80c8a0..1568b237e 100644 --- a/echopype/convert/parsed_to_zarr_ek60.py +++ b/echopype/convert/parsed_to_zarr_ek60.py @@ -253,7 +253,7 @@ def whether_write_to_zarr(self, mem_mult: float = 0.3) -> bool: return mem.total * mem_mult < req_mem - def datagram_to_zarr(self, max_mb: int) -> None: + def datagram_to_zarr(self, dest_path: str, dest_storage_options: dict, max_mb: int) -> None: """ Facilitates the conversion of a list of datagrams to a form that can be written @@ -275,7 +275,7 @@ def datagram_to_zarr(self, max_mb: int) -> None: the same. 
""" - self._create_zarr_info() + self._create_zarr_info(dest_path=dest_path, dest_storage_options=dest_storage_options) # create datagram df, if it does not exist if not isinstance(self.datagram_df, pd.DataFrame): diff --git a/echopype/convert/parsed_to_zarr_ek80.py b/echopype/convert/parsed_to_zarr_ek80.py index 468b1f6b0..19b117668 100644 --- a/echopype/convert/parsed_to_zarr_ek80.py +++ b/echopype/convert/parsed_to_zarr_ek80.py @@ -1,6 +1,11 @@ +from typing import Optional, Tuple + +import dask.array import numpy as np import pandas as pd import psutil +import xarray as xr +from zarr.errors import ArrayNotFoundError from .parsed_to_zarr_ek60 import Parsed2ZarrEK60 @@ -20,6 +25,7 @@ def __init__(self, parser_obj): self.p2z_ch_ids = {} # channel ids for power, angle, complex self.pow_ang_df = None # df that holds power and angle data self.complex_df = None # df that holds complex data + self.tx_df = None # df that holds transmit data # get channel and channel_id association and sort by channel_id channels_old = list(self.parser_obj.config_datagram["configuration"].keys()) @@ -31,6 +37,78 @@ def __init__(self, parser_obj): # obtain sort rule for the channel index self.channel_sort_rule = {ch: channels_new.index(ch) for ch in channels_old} + @property + def tx_complex_dataarrays(self) -> Optional[Tuple[xr.DataArray, xr.DataArray]]: + """ + Constructs the DataArrays from Dask arrays associated + with the transmit complex data (RAW4). + + Returns + ------- + DataArrays named "transmit_pulse_r" and "transmit_pulse_i", + respectively, representing the complex data. + """ + try: + zarr_path = self.store + + # collect variables associated with the complex data + complex_r = dask.array.from_zarr(zarr_path, component="tx_complex/transmit_pulse_r") + complex_i = dask.array.from_zarr(zarr_path, component="tx_complex/transmit_pulse_i") + + comp_time_path = "tx_complex/" + self.complex_dims[0] + comp_chan_path = "tx_complex/" + self.complex_dims[1] + complex_time = dask.array.from_zarr(zarr_path, component=comp_time_path).compute() + complex_channel = dask.array.from_zarr(zarr_path, component=comp_chan_path).compute() + + # obtain channel names for complex data + comp_chan_names = self._get_channel_ids(complex_channel) + + array_coords = { + "ping_time": ( + ["ping_time"], + complex_time, + self._varattrs["beam_coord_default"]["ping_time"], + ), + "channel": ( + ["channel"], + comp_chan_names, + self._varattrs["beam_coord_default"]["channel"], + ), + "transmit_sample": ( + ["transmit_sample"], + np.arange(complex_r.shape[2]), + { + "long_name": "Transmit pulse sample number, base 0", + "comment": "Only exist for Simrad EK80 file with RAW4 datagrams", + }, + ), + } + + transmit_pulse_r = xr.DataArray( + data=complex_r, + coords=array_coords, + name="transmit_pulse_r", + attrs={ + "long_name": "Real part of the transmit pulse", + "units": "V", + "comment": "Only exist for Simrad EK80 file with RAW4 datagrams", + }, + ) + + transmit_pulse_i = xr.DataArray( + data=complex_i, + coords=array_coords, + name="transmit_pulse_i", + attrs={ + "long_name": "Imaginary part of the transmit pulse", + "units": "V", + "comment": "Only exist for Simrad EK80 file with RAW4 datagrams", + }, + ) + return transmit_pulse_r, transmit_pulse_i + except ArrayNotFoundError: + return None + def _get_num_transd_sec(self, x: pd.DataFrame): """ Returns the number of transducer sectors. 
@@ -86,7 +164,7 @@ def _reshape_series(self, complex_series: pd.Series) -> pd.Series: ) @staticmethod - def _split_complex_data(complex_series: pd.Series) -> pd.DataFrame: + def _split_complex_data(complex_series: pd.Series, rx: bool = True) -> pd.DataFrame: """ Splits the 1D complex data into two 1D arrays representing the real and imaginary parts of @@ -105,6 +183,9 @@ def _split_complex_data(complex_series: pd.Series) -> pd.DataFrame: respectively. The DataFrame will have the same index as ``complex_series``. """ + columns = ["backscatter_r", "backscatter_i"] + if not rx: + columns = ["transmit_pulse_r", "transmit_pulse_i"] complex_split = complex_series.apply( lambda x: [np.real(x), np.imag(x)] if isinstance(x, np.ndarray) else [None, None] @@ -112,11 +193,11 @@ def _split_complex_data(complex_series: pd.Series) -> pd.DataFrame: return pd.DataFrame( data=complex_split.to_list(), - columns=["backscatter_r", "backscatter_i"], + columns=columns, index=complex_series.index, ) - def _write_complex(self, df: pd.DataFrame, max_mb: int): + def _write_complex(self, df: pd.DataFrame, max_mb: int, rx: bool = True): """ Writes the complex data and associated indices to a zarr group. @@ -142,11 +223,15 @@ def _write_complex(self, df: pd.DataFrame, max_mb: int): ) channels = channels[indexer] - complex_series = self._reshape_series(complex_series) + if rx: + complex_series = self._reshape_series(complex_series) + grp_key = "complex" + else: + grp_key = "tx_complex" - complex_df = self._split_complex_data(complex_series) + self.p2z_ch_ids[grp_key] = channels.values # store channel ids for variable - self.p2z_ch_ids["complex"] = channels.values # store channel ids for variable + complex_df = self._split_complex_data(complex_series, rx=rx) # create multi index using the product of the unique dims unique_dims = [times, channels] @@ -154,7 +239,11 @@ def _write_complex(self, df: pd.DataFrame, max_mb: int): complex_df = self.set_multi_index(complex_df, unique_dims) # write complex data to the complex group - zarr_grp = self.zarr_root.create_group("complex") + if rx: + zarr_grp = self.zarr_root.create_group(grp_key) + else: + zarr_grp = self.zarr_root.create_group(grp_key) + for column in complex_df: self.write_df_column( pd_series=complex_df[column], @@ -219,6 +308,17 @@ def _get_zarr_dfs(self): self.complex_df = datagram_df.dropna().copy() + def _get_tx_zarr_df(self) -> None: + """Create dataframe for the transmit data.""" + + tx_datagram_df = pd.DataFrame.from_dict(self.parser_obj.zarr_tx_datagrams) + # remove power and angle to conserve memory + for col in ["power", "angle"]: + if col in tx_datagram_df.columns: + del tx_datagram_df[col] + + self.tx_df = tx_datagram_df.dropna().copy() + def whether_write_to_zarr(self, mem_mult: float = 0.3) -> bool: """ Determines if the zarr data provided will expand @@ -266,7 +366,7 @@ def whether_write_to_zarr(self, mem_mult: float = 0.3) -> bool: return mem.total * mem_mult < req_mem - def datagram_to_zarr(self, max_mb: int) -> None: + def datagram_to_zarr(self, dest_path: str, dest_storage_options: dict, max_mb: int) -> None: """ Facilitates the conversion of a list of datagrams to a form that can be written @@ -288,7 +388,7 @@ def datagram_to_zarr(self, max_mb: int) -> None: the same. 
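At the user level, the swap behaviour threaded through ``datagram_to_zarr`` here is driven by the new ``destination_path`` argument of ``open_raw``; a hedged usage sketch follows (file name and bucket are hypothetical):

    import echopype as ep

    # Let echopype decide whether a temporary swap zarr store is needed
    ed = ep.open_raw("D20230101-T000000.raw", sonar_model="EK80", destination_path="auto")

    # Force swap and place the temporary store on remote storage instead of ~/.echopype
    ed = ep.open_raw(
        "D20230101-T000000.raw",
        sonar_model="EK80",
        destination_path="s3://my-bucket/echopype-swap",
        destination_storage_options={},  # empty dict if S3 access is already configured
    )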
""" - self._create_zarr_info() + self._create_zarr_info(dest_path=dest_path, dest_storage_options=dest_storage_options) # create zarr dfs, if they do not exist if not isinstance(self.pow_ang_df, pd.DataFrame) and not isinstance( @@ -308,4 +408,14 @@ def datagram_to_zarr(self, max_mb: int) -> None: del self.complex_df # free memory + # write transmit data + if not isinstance(self.tx_df, pd.DataFrame): + self._get_tx_zarr_df() + del self.parser_obj.zarr_tx_datagrams # free memory + + if not self.tx_df.empty: + self._write_complex(df=self.tx_df, max_mb=max_mb, rx=False) + + del self.tx_df # free memory + self._close_store() diff --git a/echopype/convert/set_groups_azfp.py b/echopype/convert/set_groups_azfp.py index 57f754ce9..7dc3fdf21 100644 --- a/echopype/convert/set_groups_azfp.py +++ b/echopype/convert/set_groups_azfp.py @@ -81,13 +81,14 @@ def _create_unique_channel_name(self): """ serial_number = self.parser_obj.unpacked_data["serial_number"] + frequency_number = self.parser_obj.parameters["frequency_number_phase1"] if serial_number.size == 1: freq_as_str = self.freq_sorted.astype(int).astype(str) # TODO: replace str(i+1) with Frequency Number from XML channel_id = [ - str(serial_number) + "-" + freq + "-" + str(i + 1) + str(serial_number) + "-" + freq + "-" + frequency_number[i] for i, freq in enumerate(freq_as_str) ] @@ -113,7 +114,16 @@ def set_env(self) -> xr.Dataset: "standard_name": "sea_water_temperature", "units": "deg_C", }, - ) + ), + "pressure": ( + ["time1"], + self.parser_obj.unpacked_data["pressure"], + { + "long_name": "Sea water pressure", + "standard_name": "sea_water_pressure_due_to_sea_water", + "units": "dbar", + }, + ), }, coords={ "time1": ( @@ -146,8 +156,7 @@ def set_sonar(self) -> xr.Dataset: "sonar_model": self.sonar_model, "sonar_serial_number": int(self.parser_obj.unpacked_data["serial_number"]), "sonar_software_name": "AZFP", - # TODO: software version is hardwired. 
Read it from the XML file's AZFP_Version node - "sonar_software_version": "1.4", + "sonar_software_version": "based on AZFP Matlab version 1.4", "sonar_type": "echosounder", } ds = ds.assign_attrs(sonar_attr_dict) @@ -320,7 +329,7 @@ def set_beam(self) -> List[xr.Dataset]: del N_tmp tdn = ( - unpacked_data["pulse_length"][self.freq_ind_sorted] / 1e6 + unpacked_data["pulse_len"][self.freq_ind_sorted] / 1e6 ) # Convert microseconds to seconds range_samples_per_bin = unpacked_data["range_samples_per_bin"][ self.freq_ind_sorted @@ -500,7 +509,25 @@ def set_vendor(self) -> xr.Dataset: unpacked_data = self.parser_obj.unpacked_data parameters = self.parser_obj.parameters ping_time = self.parser_obj.ping_time - tdn = parameters["pulse_length"][self.freq_ind_sorted] / 1e6 + phase_params = ["burst_interval", "pings_per_burst", "average_burst_pings"] + phase_freq_params = [ + "dig_rate", + "range_samples", + "range_averaging_samples", + "lock_out_index", + "gain", + "storage_format", + ] + tdn = [] + for num in parameters["phase_number"]: + tdn.append(parameters[f"pulse_len_phase{num}"][self.freq_ind_sorted] / 1e6) + tdn = np.array(tdn) + for param in phase_freq_params: + for num in parameters["phase_number"]: + parameters[param].append(parameters[f"{param}_phase{num}"][self.freq_ind_sorted]) + for param in phase_params: + for num in parameters["phase_number"]: + parameters[param].append(parameters[f"{param}_phase{num}"]) anc = np.array(unpacked_data["ancillary"]) # convert to np array for easy slicing # Build variables in the output xarray Dataset @@ -508,7 +535,7 @@ def set_vendor(self) -> xr.Dataset: for ind, ich in enumerate(self.freq_ind_sorted): # TODO: should not access the private function, better to compute Sv_offset in parser Sv_offset[ind] = self.parser_obj._calc_Sv_offset( - self.freq_sorted[ind], unpacked_data["pulse_length"][ich] + self.freq_sorted[ind], unpacked_data["pulse_len"][ich] ) ds = xr.Dataset( @@ -532,9 +559,9 @@ def set_vendor(self) -> xr.Dataset: "A/D converter when digitizing the returned acoustic signal" }, ), - "lockout_index": ( + "lock_out_index": ( ["channel"], - unpacked_data["lockout_index"][self.freq_ind_sorted], + unpacked_data["lock_out_index"][self.freq_ind_sorted], { "long_name": "The distance, rounded to the nearest Bin Size after the " "pulse is transmitted that over which AZFP will ignore echoes" @@ -639,18 +666,29 @@ def set_vendor(self) -> xr.Dataset: ), # parameters with channel dimension from XML file "XML_transmit_duration_nominal": ( - ["channel"], + ["phase_number", "channel"], tdn, {"long_name": "(From XML file) Nominal bandwidth of transmitted pulse"}, ), # tdn comes from parameters "XML_gain_correction": ( - ["channel"], - parameters["gain"][self.freq_ind_sorted], + ["phase_number", "channel"], + parameters["gain"], {"long_name": "(From XML file) Gain correction"}, ), + "instrument_type": parameters["instrument_type"][0], + "minor": parameters["minor"], + "major": parameters["major"], + "date": parameters["date"], + "program": parameters["program"], + "cpu": parameters["cpu"], + "serial_number": parameters["serial_number"], + "board_version": parameters["board_version"], + "file_version": parameters["file_version"], + "parameter_version": parameters["parameter_version"], + "configuration_version": parameters["configuration_version"], "XML_digitization_rate": ( - ["channel"], - parameters["dig_rate"][self.freq_ind_sorted], + ["phase_number", "channel"], + parameters["dig_rate"], { "long_name": "(From XML file) Number of samples per second in kHz 
that is " "processed by the A/D converter when digitizing the returned acoustic " @@ -658,8 +696,8 @@ def set_vendor(self) -> xr.Dataset: }, ), "XML_lockout_index": ( - ["channel"], - parameters["lockout_index"][self.freq_ind_sorted], + ["phase_number", "channel"], + parameters["lock_out_index"], { "long_name": "(From XML file) The distance, rounded to the nearest " "Bin Size after the pulse is transmitted that over which AZFP will " @@ -680,24 +718,39 @@ def set_vendor(self) -> xr.Dataset: "units": "dB re 1uPa/V at 1m", }, ), - "VTX": ( + "VTX0": ( + ["channel"], + parameters["VTX0"][self.freq_ind_sorted], + {"long_name": "Amplified voltage 0 sent to the transducer"}, + ), + "VTX1": ( + ["channel"], + parameters["VTX1"][self.freq_ind_sorted], + {"long_name": "Amplified voltage 1 sent to the transducer"}, + ), + "VTX2": ( ["channel"], - parameters["VTX"][self.freq_ind_sorted], - {"long_name": "Amplified voltage sent to the transducer"}, + parameters["VTX2"][self.freq_ind_sorted], + {"long_name": "Amplified voltage 2 sent to the transducer"}, + ), + "VTX3": ( + ["channel"], + parameters["VTX3"][self.freq_ind_sorted], + {"long_name": "Amplified voltage 3 sent to the transducer"}, ), "Sv_offset": (["channel"], Sv_offset), "number_of_samples_digitized_per_pings": ( - ["channel"], - parameters["range_samples"][self.freq_ind_sorted], + ["phase_number", "channel"], + parameters["range_samples"], ), "number_of_digitized_samples_averaged_per_pings": ( - ["channel"], - parameters["range_averaging_samples"][self.freq_ind_sorted], + ["phase_number", "channel"], + parameters["range_averaging_samples"], ), # parameters with dim len=0 from XML file "XML_sensors_flag": parameters["sensors_flag"], "XML_burst_interval": ( - [], + ["phase_number"], parameters["burst_interval"], { "long_name": "Time in seconds between bursts or between pings if the burst " @@ -706,8 +759,14 @@ def set_vendor(self) -> xr.Dataset: ), "XML_sonar_serial_number": parameters["serial_number"], "number_of_frequency": parameters["num_freq"], - "number_of_pings_per_burst": parameters["pings_per_burst"], - "average_burst_pings_flag": parameters["average_burst_pings"], + "number_of_pings_per_burst": ( + ["phase_number"], + parameters["pings_per_burst"], + ), + "average_burst_pings_flag": ( + ["phase_number"], + parameters["average_burst_pings"], + ), # temperature coefficients from XML file **{ f"temperature_k{var}": ( @@ -763,6 +822,10 @@ def set_vendor(self) -> xr.Dataset: list(range(len(unpacked_data["ancillary"][0]))), ), "ad_len": (["ad_len"], list(range(len(unpacked_data["ad"][0])))), + "phase_number": ( + ["phase_number"], + sorted([int(num) for num in parameters["phase_number"]]), + ), }, ) return set_time_encodings(ds) diff --git a/echopype/convert/set_groups_base.py b/echopype/convert/set_groups_base.py index 4cb2c5353..ddbba24bb 100644 --- a/echopype/convert/set_groups_base.py +++ b/echopype/convert/set_groups_base.py @@ -1,8 +1,7 @@ import abc import warnings -from typing import List, Set, Tuple +from typing import List, Set -import dask.array import numpy as np import pynmea2 import xarray as xr @@ -51,7 +50,7 @@ def __init__( else: self.compression_settings = COMPRESSION_SETTINGS[self.engine] - self._varattrs = sonarnetcdf_1.yaml_dict["variable_and_varattributes"] + self._varattrs = self.parsed2zarr_obj._varattrs # self._beamgroups must be a list of dicts, eg: # [{"name":"Beam_group1", "descr":"contains complex backscatter data # and other beam or channel-specific data."}] @@ -361,229 +360,3 @@ def 
beam_groups_to_convention( self._add_ping_time_dim(ds, beam_ping_time_names, ping_time_only_names) self._add_beam_dim(ds, beam_only_names, beam_ping_time_names) - - def _get_channel_ids(self, chan_str: np.ndarray) -> List[str]: - """ - Obtains the channel IDs associated with ``chan_str``. - - Parameters - ---------- - chan_str : np.ndarray - A numpy array of strings corresponding to the - keys of ``config_datagram["transceivers"]`` - - Returns - ------- - A list of strings representing the channel IDS - """ - if self.sonar_model in ["EK60", "ES70"]: - return [ - self.parser_obj.config_datagram["transceivers"][int(i)]["channel_id"] - for i in chan_str - ] - else: - return [ - self.parser_obj.config_datagram["configuration"][i]["channel_id"] for i in chan_str - ] - - def _get_power_dataarray(self, zarr_path: str) -> xr.DataArray: - """ - Constructs a DataArray from a Dask array for the power - data. - - Parameters - ---------- - zarr_path: str - Path to the zarr file that contain the power data - - Returns - ------- - DataArray named "backscatter_r" representing the - power data. - """ - - # collect variables associated with the power data - power = dask.array.from_zarr(zarr_path, component="power/power") - - pow_time_path = "power/" + self.parsed2zarr_obj.power_dims[0] - pow_chan_path = "power/" + self.parsed2zarr_obj.power_dims[1] - power_time = dask.array.from_zarr(zarr_path, component=pow_time_path).compute() - power_channel = dask.array.from_zarr(zarr_path, component=pow_chan_path).compute() - - # obtain channel names for power data - pow_chan_names = self._get_channel_ids(power_channel) - - backscatter_r = xr.DataArray( - data=power, - coords={ - "ping_time": ( - ["ping_time"], - power_time, - self._varattrs["beam_coord_default"]["ping_time"], - ), - "channel": ( - ["channel"], - pow_chan_names, - self._varattrs["beam_coord_default"]["channel"], - ), - "range_sample": ( - ["range_sample"], - np.arange(power.shape[2]), - self._varattrs["beam_coord_default"]["range_sample"], - ), - }, - name="backscatter_r", - attrs={ - "long_name": self._varattrs["beam_var_default"]["backscatter_r"]["long_name"], - "units": "dB", - }, - ) - - return backscatter_r - - def _get_angle_dataarrays(self, zarr_path: str) -> Tuple[xr.DataArray, xr.DataArray]: - """ - Constructs the DataArrays from Dask arrays associated - with the angle data. - - Parameters - ---------- - zarr_path: str - Path to the zarr file that contains the angle data - - Returns - ------- - DataArrays named "angle_athwartship" and "angle_alongship", - respectively, representing the angle data. 
- """ - - # collect variables associated with the angle data - angle_along = dask.array.from_zarr(zarr_path, component="angle/angle_alongship") - angle_athwart = dask.array.from_zarr(zarr_path, component="angle/angle_athwartship") - - ang_time_path = "angle/" + self.parsed2zarr_obj.angle_dims[0] - ang_chan_path = "angle/" + self.parsed2zarr_obj.angle_dims[1] - angle_time = dask.array.from_zarr(zarr_path, component=ang_time_path).compute() - angle_channel = dask.array.from_zarr(zarr_path, component=ang_chan_path).compute() - - # obtain channel names for angle data - ang_chan_names = self._get_channel_ids(angle_channel) - - array_coords = { - "ping_time": ( - ["ping_time"], - angle_time, - self._varattrs["beam_coord_default"]["ping_time"], - ), - "channel": ( - ["channel"], - ang_chan_names, - self._varattrs["beam_coord_default"]["channel"], - ), - "range_sample": ( - ["range_sample"], - np.arange(angle_athwart.shape[2]), - self._varattrs["beam_coord_default"]["range_sample"], - ), - } - - angle_athwartship = xr.DataArray( - data=angle_athwart, - coords=array_coords, - name="angle_athwartship", - attrs={ - "long_name": "electrical athwartship angle", - "comment": ( - "Introduced in echopype for Simrad echosounders. " # noqa - + "The athwartship angle corresponds to the major angle in SONAR-netCDF4 vers 2. " # noqa - ), - }, - ) - - angle_alongship = xr.DataArray( - data=angle_along, - coords=array_coords, - name="angle_alongship", - attrs={ - "long_name": "electrical alongship angle", - "comment": ( - "Introduced in echopype for Simrad echosounders. " # noqa - + "The alongship angle corresponds to the minor angle in SONAR-netCDF4 vers 2. " # noqa - ), - }, - ) - - return angle_athwartship, angle_alongship - - def _get_complex_dataarrays(self, zarr_path: str) -> Tuple[xr.DataArray, xr.DataArray]: - """ - Constructs the DataArrays from Dask arrays associated - with the complex data. - - Parameters - ---------- - zarr_path: str - Path to the zarr file that contains the complex data - - Returns - ------- - DataArrays named "backscatter_r" and "backscatter_i", - respectively, representing the complex data. 
- """ - - # collect variables associated with the complex data - complex_r = dask.array.from_zarr(zarr_path, component="complex/backscatter_r") - complex_i = dask.array.from_zarr(zarr_path, component="complex/backscatter_i") - - comp_time_path = "complex/" + self.parsed2zarr_obj.complex_dims[0] - comp_chan_path = "complex/" + self.parsed2zarr_obj.complex_dims[1] - complex_time = dask.array.from_zarr(zarr_path, component=comp_time_path).compute() - complex_channel = dask.array.from_zarr(zarr_path, component=comp_chan_path).compute() - - # obtain channel names for complex data - comp_chan_names = self._get_channel_ids(complex_channel) - - array_coords = { - "ping_time": ( - ["ping_time"], - complex_time, - self._varattrs["beam_coord_default"]["ping_time"], - ), - "channel": ( - ["channel"], - comp_chan_names, - self._varattrs["beam_coord_default"]["channel"], - ), - "range_sample": ( - ["range_sample"], - np.arange(complex_r.shape[2]), - self._varattrs["beam_coord_default"]["range_sample"], - ), - "beam": ( - ["beam"], - np.arange(start=1, stop=complex_r.shape[3] + 1).astype(str), - self._varattrs["beam_coord_default"]["beam"], - ), - } - - backscatter_r = xr.DataArray( - data=complex_r, - coords=array_coords, - name="backscatter_r", - attrs={ - "long_name": self._varattrs["beam_var_default"]["backscatter_r"]["long_name"], - "units": "dB", - }, - ) - - backscatter_i = xr.DataArray( - data=complex_i, - coords=array_coords, - name="backscatter_i", - attrs={ - "long_name": self._varattrs["beam_var_default"]["backscatter_i"]["long_name"], - "units": "dB", - }, - ) - - return backscatter_r, backscatter_i diff --git a/echopype/convert/set_groups_ek60.py b/echopype/convert/set_groups_ek60.py index f5270014b..c8529091b 100644 --- a/echopype/convert/set_groups_ek60.py +++ b/echopype/convert/set_groups_ek60.py @@ -336,9 +336,8 @@ def _set_beam_group1_zarr_vars(self, ds: xr.Dataset) -> xr.Dataset: # functions below. # obtain DataArrays using zarr variables - zarr_path = self.parsed2zarr_obj.zarr_file_name - backscatter_r = self._get_power_dataarray(zarr_path) - angle_athwartship, angle_alongship = self._get_angle_dataarrays(zarr_path) + backscatter_r = self.parsed2zarr_obj.power_dataarray + angle_athwartship, angle_alongship = self.parsed2zarr_obj.angle_dataarrays # append DataArrays created from zarr file ds = ds.assign( diff --git a/echopype/convert/set_groups_ek80.py b/echopype/convert/set_groups_ek80.py index 51c4aae23..77b06cdf3 100644 --- a/echopype/convert/set_groups_ek80.py +++ b/echopype/convert/set_groups_ek80.py @@ -74,7 +74,7 @@ def __init__(self, *args, **kwargs): # if we have zarr files, create parser_obj.ch_ids if self.parsed2zarr_obj.temp_zarr_dir: for k, v in self.parsed2zarr_obj.p2z_ch_ids.items(): - self.parser_obj.ch_ids[k] = self._get_channel_ids(v) + self.parser_obj.ch_ids[k] = self.parsed2zarr_obj._get_channel_ids(v) # obtain sorted channel dict in ascending order for each usage scenario self.sorted_channel = { @@ -1067,26 +1067,25 @@ def _get_ds_beam_power_zarr(self, ds_invariant_power: xr.Dataset) -> xr.Dataset: # functions below. 
# obtain DataArrays using zarr variables - zarr_path = self.parsed2zarr_obj.zarr_file_name - backscatter_r = self._get_power_dataarray(zarr_path) - angle_athwartship, angle_alongship = self._get_angle_dataarrays(zarr_path) + backscatter_r = self.parsed2zarr_obj.power_dataarray + angle_athwartship, angle_alongship = self.parsed2zarr_obj.angle_dataarrays + + # Obtain RAW4 transmit pulse data if it exists + tx_pulse_list = [] + if ( + hasattr(self.parsed2zarr_obj, "tx_complex_dataarrays") + and self.parsed2zarr_obj.tx_complex_dataarrays is not None + ): + tx_pulse_list = list(self.parsed2zarr_obj.tx_complex_dataarrays) # create power related ds using DataArrays created from zarr file - ds_power = xr.merge([backscatter_r, angle_athwartship, angle_alongship]) + ds_power = xr.merge( + [backscatter_r, angle_athwartship, angle_alongship] + tx_pulse_list, + combine_attrs="override", + ) ds_power = set_time_encodings(ds_power) - # obtain additional variables that need to be added to ds_power - ds_tmp = [] - for ch in self.sorted_channel["power"]: - ds_data = self._add_trasmit_pulse_complex(ds_tmp=xr.Dataset(), ch=ch) - ds_data = set_time_encodings(ds_data) - - ds_data = self._attach_vars_to_ds_data(ds_data, ch, rs_size=ds_power.range_sample.size) - ds_tmp.append(ds_data) - - ds_tmp = self.merge_save(ds_tmp, ds_invariant_power) - - return xr.merge([ds_tmp, ds_power], combine_attrs="override") + return xr.merge([ds_invariant_power, ds_power], combine_attrs="override") def _get_ds_complex_zarr(self, ds_invariant_complex: xr.Dataset) -> xr.Dataset: """ @@ -1109,8 +1108,7 @@ def _get_ds_complex_zarr(self, ds_invariant_complex: xr.Dataset) -> xr.Dataset: # functions below. # obtain DataArrays using zarr variables - zarr_path = self.parsed2zarr_obj.zarr_file_name - backscatter_r, backscatter_i = self._get_complex_dataarrays(zarr_path) + backscatter_r, backscatter_i = self.parsed2zarr_obj.complex_dataarrays # create power related ds using DataArrays created from zarr file ds_complex = xr.merge([backscatter_r, backscatter_i]) diff --git a/echopype/convert/utils/ek.py b/echopype/convert/utils/ek.py new file mode 100644 index 000000000..3393f1723 --- /dev/null +++ b/echopype/convert/utils/ek.py @@ -0,0 +1,89 @@ +import sys +from functools import reduce + +import numpy as np +import pandas as pd +import psutil + +COMPLEX_VAR = "complex" + + +def _get_power_dims(dgram_zarr_vars): + return list(reduce(lambda x, y: {*x, *y}, dgram_zarr_vars.values())) + + +def _extract_datagram_dfs(zarr_datagrams, dgram_zarr_vars): + data_keys = dgram_zarr_vars.keys() + power_dims = _get_power_dims(dgram_zarr_vars) + power_angle = [k for k in data_keys if k != COMPLEX_VAR] + + datagram_df = pd.DataFrame.from_dict(zarr_datagrams) + + pow_ang_df = datagram_df[power_dims + power_angle] + + complex_df = None + if COMPLEX_VAR in datagram_df: + # Not EK60 + complex_df = datagram_df[power_dims + [COMPLEX_VAR]] + + # Clean up nans if there's any + if isinstance(pow_ang_df, pd.DataFrame): + pow_ang_df = pow_ang_df.dropna().reset_index(drop=True) + + if isinstance(complex_df, pd.DataFrame): + complex_df = complex_df.dropna().reset_index(drop=True) + + return pow_ang_df, complex_df + + +def get_req_mem(datagram_df, dgram_zarr_vars): + total_req_mem = 0 + if datagram_df is not None: + power_dims = _get_power_dims(dgram_zarr_vars) + df_shapes = datagram_df.apply( + lambda col: col.unique().shape + if col.name in power_dims + else col.apply(lambda row: row.shape).max(), + result_type="reduce", + ) + + for k, v in dgram_zarr_vars.items(): + if k 
in df_shapes: + cols = v + [k] + expected_shape = reduce(lambda x, y: x + y, df_shapes[cols]) + itemsize = datagram_df[k].dtype.itemsize + req_mem = np.prod(expected_shape) * itemsize + total_req_mem += req_mem + + return total_req_mem + + +def _get_zarr_dgrams_size(zarr_datagrams) -> int: + """ + Returns the size in bytes of the list of zarr + datagrams. + """ + + size = 0 + for i in zarr_datagrams: + size += sum([sys.getsizeof(val) for val in i.values()]) + + return size + + +def should_use_swap(zarr_datagrams, dgram_zarr_vars, mem_mult: float = 0.3) -> bool: + zdgrams_mem = _get_zarr_dgrams_size(zarr_datagrams) + + # Estimate expansion size + pow_ang_df, complex_df = _extract_datagram_dfs(zarr_datagrams, dgram_zarr_vars) + pow_ang_mem = get_req_mem(pow_ang_df, dgram_zarr_vars) + complex_mem = get_req_mem(complex_df, dgram_zarr_vars) + total_mem = pow_ang_mem + complex_mem + + # get statistics about system memory usage + mem = psutil.virtual_memory() + + # approx. the amount of memory that will be used after expansion + req_mem = mem.used - zdgrams_mem + total_mem + + return mem.total * mem_mult < req_mem diff --git a/echopype/convert/utils/ek_raw_parsers.py b/echopype/convert/utils/ek_raw_parsers.py index dae9161a7..8c349a0f2 100644 --- a/echopype/convert/utils/ek_raw_parsers.py +++ b/echopype/convert/utils/ek_raw_parsers.py @@ -16,6 +16,7 @@ import numpy as np from ...utils.log import _init_logger +from ...utils.misc import camelcase2snakecase from .ek_date_conversion import nt_to_unix TCVR_CH_NUM_MATCHER = re.compile(r"\d{6}-\w{1,2}|\w{12}-\w{1,2}") @@ -706,22 +707,6 @@ def _unpack_contents(self, raw_string, bytes_read, version): :returns: None """ - def from_CamelCase(xml_param): - """ - convert name from CamelCase to fit with existing naming convention by - inserting an underscore before each capital and then lowering the caps - e.g. CamelCase becomes camel_case. - """ - idx = list(reversed([i for i, c in enumerate(xml_param) if c.isupper()])) - param_len = len(xml_param) - for i in idx: - # check if we should insert an underscore - if i > 0 and i < param_len: - xml_param = xml_param[:i] + "_" + xml_param[i:] - xml_param = xml_param.lower() - - return xml_param - def dict_to_dict(xml_dict, data_dict, parse_opts): """ dict_to_dict appends the ETree xml value dicts to a provided dictionary @@ -760,13 +745,13 @@ def dict_to_dict(xml_dict, data_dict, parse_opts): data_dict[parse_opts[k][1]] = data else: # add using the default key name wrangling - data_dict[from_CamelCase(k)] = data + data_dict[camelcase2snakecase(k)] = data else: # nothing to do with the value string data = xml_dict[k] # add the parameter to the provided dictionary - data_dict[from_CamelCase(k)] = data + data_dict[camelcase2snakecase(k)] = data header_values = struct.unpack( self.header_fmt(version), raw_string[: self.header_size(version)] diff --git a/echopype/echodata/echodata.py b/echopype/echodata/echodata.py index 083eb47a2..b1293d1b1 100644 --- a/echopype/echodata/echodata.py +++ b/echopype/echodata/echodata.py @@ -1,5 +1,4 @@ import datetime -import shutil import warnings from html import escape from pathlib import Path @@ -76,18 +75,19 @@ def __init__( self._varattrs = sonarnetcdf_1.yaml_dict["variable_and_varattributes"] - def __del__(self): - # TODO: this destructor seems to not work in Jupyter Lab if restart or - # even clear all outputs is used. 
It will work if you explicitly delete the object - - if (self.parsed2zarr_obj is not None) and (self.parsed2zarr_obj.zarr_file_name is not None): + def cleanup(self): + if (self.parsed2zarr_obj is not None) and (self.parsed2zarr_obj.store is not None): # get Path object of temporary zarr file created by Parsed2Zarr - p2z_temp_file = Path(self.parsed2zarr_obj.zarr_file_name) + p2z_temp_file = self.parsed2zarr_obj.store # remove temporary directory created by Parsed2Zarr, if it exists - if p2z_temp_file.exists(): - # TODO: do we need to check file permissions here? - shutil.rmtree(p2z_temp_file) + if p2z_temp_file.fs.exists(p2z_temp_file.root): + p2z_temp_file.fs.rm(p2z_temp_file.root, recursive=True) + + def __del__(self): + # TODO: this destructor seems to not work in Jupyter Lab if restart or + # even clear all outputs is used. It will work if you explicitly delete the object + self.cleanup() def __str__(self) -> str: fpath = "Internal Memory" @@ -517,9 +517,21 @@ def _clip_by_time_dim(external_ds, ext_time_dim_name): if v["ext_time_dim_name"] == "scalar" ] for platform_var in scalar_vars: + # Set timestamp equal to the first ping time whenever either + # latitude or longitude is updated without a time dimension ext_var = mappings_expanded[platform_var]["external_var"] - # Replace the scalar value and add a history attribute - platform[platform_var].data = float(extra_platform_data[ext_var].data) + if platform_var == "latitude" or platform_var == "longitude": + platform[platform_var].data = np.array([extra_platform_data[ext_var].data]) + platform[platform_var] = platform[platform_var].assign_coords( + **{ + platform[platform_var].dims[0]: [ + self["Sonar/Beam_group1"]["ping_time"].data[0] + ] + } + ) + else: + # Replace the scalar value and add a history attribute + platform[platform_var].data = float(extra_platform_data[ext_var].data) platform[platform_var] = platform[platform_var].assign_attrs( **{"history": f"{history_attr}. 
From external {ext_var} variable."} ) diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py new file mode 100644 index 000000000..9c30c504f --- /dev/null +++ b/echopype/tests/commongrid/conftest.py @@ -0,0 +1,789 @@ +import pytest + +import xarray as xr +import numpy as np +import pandas as pd +from typing import Literal + +from echopype.consolidate import add_depth +from echopype.commongrid.utils import ( + get_distance_from_latlon, +) +import echopype as ep + + +@pytest.fixture +def random_number_generator(): + """Random number generator for tests""" + return np.random.default_rng() + + +@pytest.fixture +def mock_nan_ilocs(): + """NaN i locations for irregular Sv dataset + + It's a list of tuples, each tuple contains + (channel, ping_time, range_sample) + + Notes + ----- + This was created with the following code: + + ``` + import numpy as np + + random_positions = [] + for i in range(20): + random_positions.append(( + np.random.randint(0, 2), + np.random.randint(0, 5), + np.random.randint(0, 20)) + ) + ``` + """ + return [ + (1, 1, 10), + (1, 0, 16), + (0, 3, 6), + (0, 2, 11), + (0, 2, 6), + (1, 1, 14), + (0, 1, 17), + (1, 4, 19), + (0, 3, 3), + (0, 0, 19), + (0, 1, 5), + (1, 2, 9), + (1, 4, 18), + (0, 1, 5), + (0, 4, 4), + (0, 1, 6), + (1, 2, 2), + (0, 1, 2), + (0, 4, 8), + (0, 1, 1), + ] + + +@pytest.fixture +def mock_parameters(): + """Small mock parameters""" + return { + "channel_len": 2, + "ping_time_len": 10, + "depth_len": 20, + "ping_time_interval": "0.3S", + } + + +@pytest.fixture +def mock_Sv_sample(mock_parameters): + """ + Mock Sv sample + + Dimension: (2, 10, 20) + """ + channel_len = mock_parameters["channel_len"] + ping_time_len = mock_parameters["ping_time_len"] + depth_len = mock_parameters["depth_len"] + + depth_data = np.linspace(0, 1, num=depth_len) + return np.tile(depth_data, (channel_len, ping_time_len, 1)) + + +def _add_latlon_depth(ds_Sv, latlon=False, depth=False, lat_attrs={}, lon_attrs={}, depth_offset=0): + """Adds lat lon variables and/or depth to ds_Sv""" + if latlon: + # Add lat lon + n_pings = ds_Sv.ping_time.shape[0] + latitude = np.linspace(42.48916859, 42.49071833, num=n_pings) + longitude = np.linspace(-124.88296688, -124.81919229, num=n_pings) + + ds_Sv["latitude"] = (["ping_time"], latitude, lat_attrs) + ds_Sv["longitude"] = (["ping_time"], longitude, lon_attrs) + + # Need processing level code for compute MVBS to work! 
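+ # ("Level 2A" is only a placeholder value; what the downstream
+ # compute_MVBS calls in these tests need is that a processing_level
+ # attribute exists at all)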
+ ds_Sv.attrs["processing_level"] = "Level 2A" + + if depth: + # Add depth + ds_Sv = ds_Sv.pipe(add_depth, depth_offset=depth_offset) + return ds_Sv + + +@pytest.fixture +def mock_Sv_dataset_regular(mock_parameters, mock_Sv_sample, lat_attrs, lon_attrs, depth_offset): + ds_Sv = _gen_Sv_echo_range_regular(**mock_parameters, ping_time_jitter_max_ms=0) + ds_Sv["Sv"].data = mock_Sv_sample + + # Add latlon and depth + ds_Sv = _add_latlon_depth( + ds_Sv, + latlon=True, + depth=True, + lat_attrs=lat_attrs, + lon_attrs=lon_attrs, + depth_offset=depth_offset, + ) + return ds_Sv + + +@pytest.fixture +def mock_Sv_dataset_irregular( + mock_parameters, mock_Sv_sample, mock_nan_ilocs, lat_attrs, lon_attrs, depth_offset +): + depth_interval = [0.5, 0.32, 0.2] + depth_ping_time_len = [2, 3, 5] + ds_Sv = _gen_Sv_echo_range_irregular( + **mock_parameters, + depth_interval=depth_interval, + depth_ping_time_len=depth_ping_time_len, + ping_time_jitter_max_ms=30, # Added jitter to ping_time + ) + ds_Sv["Sv"].data = mock_Sv_sample + + # Add latlon and depth + ds_Sv = _add_latlon_depth( + ds_Sv, + latlon=True, + depth=True, + lat_attrs=lat_attrs, + lon_attrs=lon_attrs, + depth_offset=depth_offset, + ) + + # Sprinkle nans around echo_range + for pos in mock_nan_ilocs: + ds_Sv["echo_range"][pos] = np.nan + return ds_Sv + + +@pytest.fixture +def mock_mvbs_inputs(): + return dict(range_meter_bin=2, ping_time_bin="1s") + + +@pytest.fixture +def mock_mvbs_array_regular(mock_Sv_dataset_regular, mock_mvbs_inputs, mock_parameters): + """ + Mock Sv sample result from compute_MVBS + + Dimension: (2, 3, 5) + Ping time bin: 1s + Range bin: 2m + """ + ds_Sv = mock_Sv_dataset_regular + ping_time_bin = mock_mvbs_inputs["ping_time_bin"] + range_bin = mock_mvbs_inputs["range_meter_bin"] + channel_len = mock_parameters["channel_len"] + expected_mvbs_val = _get_expected_mvbs_val(ds_Sv, ping_time_bin, range_bin, channel_len) + + return expected_mvbs_val + + +@pytest.fixture +def mock_nasc_array_regular(mock_Sv_dataset_regular, mock_parameters): + """ + Mock Sv sample result from compute_MVBS + + Dimension: (2, 3, 5) + Ping time bin: 1s + Range bin: 2m + """ + ds_Sv = mock_Sv_dataset_regular + dist_bin = 0.5 + range_bin = 2 + channel_len = mock_parameters["channel_len"] + expected_nasc_val = _get_expected_nasc_val(ds_Sv, dist_bin, range_bin, channel_len) + + return expected_nasc_val + + +@pytest.fixture +def mock_mvbs_array_irregular(mock_Sv_dataset_irregular, mock_mvbs_inputs, mock_parameters): + """ + Mock Sv sample irregular result from compute_MVBS + + Dimension: (2, 3, 5) + Ping time bin: 1s + Range bin: 2m + """ + ds_Sv = mock_Sv_dataset_irregular + ping_time_bin = mock_mvbs_inputs["ping_time_bin"] + range_bin = mock_mvbs_inputs["range_meter_bin"] + channel_len = mock_parameters["channel_len"] + expected_mvbs_val = _get_expected_mvbs_val(ds_Sv, ping_time_bin, range_bin, channel_len) + + return expected_mvbs_val + + +@pytest.fixture +def mock_nasc_array_irregular(mock_Sv_dataset_irregular, mock_parameters): + """ + Mock Sv sample result from compute_MVBS + + Dimension: (2, 3, 5) + Ping time bin: 1s + Range bin: 2m + """ + ds_Sv = mock_Sv_dataset_irregular + dist_bin = 0.5 + range_bin = 2 + channel_len = mock_parameters["channel_len"] + expected_nasc_val = _get_expected_nasc_val(ds_Sv, dist_bin, range_bin, channel_len) + + return expected_nasc_val + + +@pytest.fixture( + params=[ + ( + ("EK60", "ncei-wcsd", "Summer2017-D20170719-T211347.raw"), + "EK60", + None, + {}, + ), + ( + ("EK80_NEW", "echopype-test-D20211004-T235930.raw"), 
+ "EK80", + None, + {"waveform_mode": "BB", "encode_mode": "complex"}, + ), + ( + ("EK80_NEW", "D20211004-T233354.raw"), + "EK80", + None, + {"waveform_mode": "CW", "encode_mode": "power"}, + ), + ( + ("EK80_NEW", "D20211004-T233115.raw"), + "EK80", + None, + {"waveform_mode": "CW", "encode_mode": "complex"}, + ), + (("ES70", "D20151202-T020259.raw"), "ES70", None, {}), + (("AZFP", "17082117.01A"), "AZFP", ("AZFP", "17041823.XML"), {}), + ( + ("AD2CP", "raw", "090", "rawtest.090.00001.ad2cp"), + "AD2CP", + None, + {}, + ), + ], + ids=[ + "ek60_cw_power", + "ek80_bb_complex", + "ek80_cw_power", + "ek80_cw_complex", + "es70", + "azfp", + "ad2cp", + ], +) +def test_data_samples(request, test_path): + ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) = request.param + if sonar_model.lower() in ["es70", "ad2cp"]: + pytest.xfail( + reason="Not supported at the moment", + ) + path_model, *paths = filepath + filepath = test_path[path_model].joinpath(*paths) + + if azfp_xml_path is not None: + path_model, *paths = azfp_xml_path + azfp_xml_path = test_path[path_model].joinpath(*paths) + return ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) + + +@pytest.fixture +def regular_data_params(): + return { + "channel_len": 4, + "depth_len": 4000, + "ping_time_len": 100, + "ping_time_jitter_max_ms": 0, + } + + +@pytest.fixture +def ds_Sv_echo_range_regular(regular_data_params, random_number_generator): + return _gen_Sv_echo_range_regular( + **regular_data_params, + random_number_generator=random_number_generator, + ) + + +@pytest.fixture +def latlon_history_attr(): + return ( + "2023-08-31 12:00:00.000000 +00:00. " + "Interpolated or propagated from Platform latitude/longitude." # noqa + ) + + +@pytest.fixture +def lat_attrs(latlon_history_attr): + """Latitude attributes""" + return { + "long_name": "Platform latitude", + "standard_name": "latitude", + "units": "degrees_north", + "valid_range": "(-90.0, 90.0)", + "history": latlon_history_attr, + } + + +@pytest.fixture +def lon_attrs(latlon_history_attr): + """Longitude attributes""" + return { + "long_name": "Platform longitude", + "standard_name": "longitude", + "units": "degrees_east", + "valid_range": "(-180.0, 180.0)", + "history": latlon_history_attr, + } + + +@pytest.fixture +def depth_offset(): + """Depth offset for calculating depth""" + return 2.5 + + +@pytest.fixture +def ds_Sv_echo_range_regular_w_latlon(ds_Sv_echo_range_regular, lat_attrs, lon_attrs): + """Sv dataset with latitude and longitude""" + ds_Sv_echo_range_regular = _add_latlon_depth( + ds_Sv_echo_range_regular, latlon=True, lat_attrs=lat_attrs, lon_attrs=lon_attrs + ) + return ds_Sv_echo_range_regular + + +@pytest.fixture +def ds_Sv_echo_range_regular_w_depth(ds_Sv_echo_range_regular, depth_offset): + """Sv dataset with depth""" + return ds_Sv_echo_range_regular.pipe(add_depth, depth_offset=depth_offset) + + +@pytest.fixture +def ds_Sv_echo_range_irregular(random_number_generator): + depth_interval = [0.5, 0.32, 0.13] + depth_ping_time_len = [100, 300, 200] + ping_time_len = 600 + ping_time_interval = "0.3S" + return _gen_Sv_echo_range_irregular( + depth_interval=depth_interval, + depth_ping_time_len=depth_ping_time_len, + ping_time_len=ping_time_len, + ping_time_interval=ping_time_interval, + ping_time_jitter_max_ms=0, + random_number_generator=random_number_generator, + ) + + +# Helper functions for NASC testing +def _create_dataset(i, sv, dim, rng): + dims = { + "range_sample": np.arange(5), + "distance_nmi": np.arange(5), + } + # Add one for 
other channel + sv = sv + (rng.random() * 5) + Sv = ep.utils.compute._lin2log(sv) + ds_Sv = xr.Dataset( + { + "Sv": (list(dims.keys()), Sv), + "depth": (list(dims.keys()), np.array([dim] * 5).T), + "ping_time": ( + ["distance_nmi"], + pd.date_range("2020-01-01", periods=len(dim), freq="1min"), + ), + }, + coords=dict(channel=f"ch_{i}", **dims), + ) + return ds_Sv + + +def get_NASC_echoview(ds_Sv, ch_idx=0, r0=2, r1=20): + """ + Computes NASC using echoview's method, 1 channel only, + as described in https://gist.github.com/leewujung/3b058ab63c3b897b273b33b907b62f6d + """ + r = ds_Sv.depth.isel(channel=ch_idx, distance_nmi=0).values + # get r0 and r1 indexes + # these are used to slice the desired Sv samples + r0 = np.argmin(abs(r - r0)) + r1 = np.argmin(abs(r - r1)) + + sh = np.r_[np.diff(r), np.nan] + + sv = ds_Sv["Sv"].pipe(ep.utils.compute._log2lin).isel(channel=ch_idx).values + sv_mean_echoview = np.nanmean(sv[r0:r1]) + h_mean_echoview = np.sum(sh[r0:r1]) * sv.shape[1] / sv.shape[1] + + NASC_echoview = sv_mean_echoview * h_mean_echoview * 4 * np.pi * 1852**2 + return NASC_echoview + + +@pytest.fixture +def mock_Sv_dataset_NASC(mock_parameters, random_number_generator): + channel_len = mock_parameters["channel_len"] + dim0 = np.array([0.5, 1.5, 2.5, 3.5, 9]) + sv0 = np.array( + [ + [1.0, 2.0, 3.0, 4.0, np.nan], + [6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0], + [16.0, 17.0, 18.0, 19.0, np.nan], + [21.0, 22.0, 23.0, 24.0, 25.0], + ] + ) + return xr.concat( + [_create_dataset(i, sv0, dim0, random_number_generator) for i in range(channel_len)], + dim="channel", + ) + + +# Helper functions to generate mock Sv and MVBS dataset +def _get_expected_nasc_val( + ds_Sv: xr.Dataset, dist_bin: str, range_bin: float, channel_len: int = 2 +) -> np.ndarray: + """ + Helper functions to generate expected NASC outputs from mock Sv dataset + by brute-force looping and compute the mean + + Parameters + ---------- + ds_Sv : xr.Dataset + Mock Sv dataset + dist_bin : float + Distance bin + range_bin : float + Range bin + channel_len : int, default 2 + Number of channels + """ + # Get distance from lat/lon in nautical miles + dist_nmi = get_distance_from_latlon(ds_Sv) + ds_Sv = ds_Sv.assign_coords({"distance_nmi": ("ping_time", dist_nmi)}).swap_dims( + {"ping_time": "distance_nmi"} + ) + + # create bin information along distance_nmi + # this computes the distance max since there might NaNs in the data + dist_max = ds_Sv["distance_nmi"].max() + dist_interval = np.arange(0, dist_max + dist_bin, dist_bin) + + # create bin information for depth + # this computes the depth max since there might NaNs in the data + depth_max = ds_Sv["depth"].max() + range_interval = np.arange(0, depth_max + range_bin, range_bin) + + sv = ds_Sv["Sv"].pipe(ep.utils.compute._log2lin) + + # Compute sv mean + sv_mean = _brute_mean_reduce_3d( + sv, ds_Sv, "depth", "distance_nmi", channel_len, dist_interval, range_interval + ) + + # Calculate denominator + h_mean_denom = np.ones(len(dist_interval) - 1) * np.nan + for x_idx in range(len(dist_interval) - 1): + x_range = ds_Sv["distance_nmi"].sel( + distance_nmi=slice(dist_interval[x_idx], dist_interval[x_idx + 1]) + ) + h_mean_denom[x_idx] = float(len(x_range.data)) + + # Calculate numerator + r_diff = ds_Sv["depth"].diff(dim="range_sample", label="lower") + depth = ds_Sv["depth"].isel(**{"range_sample": slice(0, -1)}) + h_mean_num = np.ones((channel_len, len(dist_interval) - 1, len(range_interval) - 1)) * np.nan + for ch_idx in range(channel_len): + for x_idx in 
range(len(dist_interval) - 1): + for r_idx in range(len(range_interval) - 1): + x_range = depth.isel(channel=ch_idx).sel( + **{"distance_nmi": slice(dist_interval[x_idx], dist_interval[x_idx + 1])} + ) + r_idx_active = np.logical_and( + x_range.data >= range_interval[r_idx], + x_range.data < range_interval[r_idx + 1], + ) + r_tmp = ( + r_diff.isel(channel=ch_idx) + .sel(**{"distance_nmi": slice(dist_interval[x_idx], dist_interval[x_idx + 1])}) + .data[r_idx_active] + ) + if 0 in r_tmp.shape: + h_mean_num[ch_idx, x_idx, r_idx] = np.nan + else: + h_mean_num[ch_idx, x_idx, r_idx] = np.sum(r_tmp) + + # Compute raw NASC + h_mean_num_da = xr.DataArray(h_mean_num, dims=["channel", "distance_nmi", "depth"]) + h_mean_denom_da = xr.DataArray(h_mean_denom, dims=["distance_nmi"]) + h_mean = h_mean_num_da / h_mean_denom_da + # Combine to compute NASC + return sv_mean * h_mean * 4 * np.pi * 1852**2 + + +def _brute_mean_reduce_3d( + sv: xr.DataArray, + ds_Sv: xr.Dataset, + range_var: Literal["echo_range", "depth"], + x_var: Literal["ping_time", "distance_nmi"], + channel_len: int, + x_interval: list, + range_interval: list, +) -> np.ndarray: + """ + Perform brute force reduction on sv data for 3 Dimensions + + Parameters + ---------- + sv : xr.DataArray + A DataArray containing ``sv`` data with coordinates + ds_Sv : xr.Dataset + A Dataset containing ``Sv`` and other variables, + depending on computation performed. + + For MVBS computation, this must contain ``Sv`` and ``echo_range`` data + with coordinates ``channel``, ``ping_time``, and ``range_sample`` + at bare minimum. + Or this can contain ``Sv`` and ``depth`` data with similar coordinates. + + For NASC computatioon this must contain ``Sv`` and ``depth`` data + with coordinates ``channel``, ``distance_nmi``, and ``range_sample``. + range_var: {'echo_range', 'depth'}, default 'echo_range' + The variable to use for range binning. + Either ``echo_range`` or ``depth``. + + **For NASC, this must be ``depth``.** + x_var : {'ping_time', 'distance_nmi'}, default 'ping_time' + The variable to use for x binning. This will determine + if computation is for MVBS or NASC. + channel_len : int + Number of channels + x_interval : list + 1D array or interval index representing + the bins required for ``ping_time`` or ``distance_nmi``. 
+ range_interval : list + 1D array or interval index representing + the bins required for ``range_var`` + """ + mean_vals = np.ones((channel_len, len(x_interval) - 1, len(range_interval) - 1)) * np.nan + + for ch_idx in range(channel_len): + for x_idx in range(len(x_interval) - 1): + for r_idx in range(len(range_interval) - 1): + x_range = ( + ds_Sv[range_var] + .isel(channel=ch_idx) + .sel(**{x_var: slice(x_interval[x_idx], x_interval[x_idx + 1])}) + ) + r_idx_active = np.logical_and( + x_range.data >= range_interval[r_idx], + x_range.data < range_interval[r_idx + 1], + ) + sv_tmp = ( + sv.isel(channel=ch_idx) + .sel(**{x_var: slice(x_interval[x_idx], x_interval[x_idx + 1])}) + .data[r_idx_active] + ) + if 0 in sv_tmp.shape: + mean_vals[ch_idx, x_idx, r_idx] = np.nan + else: + mean_vals[ch_idx, x_idx, r_idx] = np.mean(sv_tmp) + return mean_vals + + +def _get_expected_mvbs_val( + ds_Sv: xr.Dataset, ping_time_bin: str, range_bin: float, channel_len: int = 2 +) -> np.ndarray: + """ + Helper functions to generate expected MVBS outputs from mock Sv dataset + by brute-force looping and compute the mean + + Parameters + ---------- + ds_Sv : xr.Dataset + Mock Sv dataset + ping_time_bin : str + Ping time bin + range_bin : float + Range bin + channel_len : int, default 2 + Number of channels + """ + # create bin information needed for ping_time + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .first() # Not actually being used, but needed to get the bin groups + .indexes["ping_time"] + ) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]).values + + # create bin information for echo_range + # this computes the echo range max since there might NaNs in the data + echo_range_max = ds_Sv["echo_range"].max() + range_interval = np.arange(0, echo_range_max + range_bin, range_bin) + + sv = ds_Sv["Sv"].pipe(ep.utils.compute._log2lin) + + expected_mvbs_val = _brute_mean_reduce_3d( + sv, ds_Sv, "echo_range", "ping_time", channel_len, ping_interval, range_interval + ) + return ep.utils.compute._lin2log(expected_mvbs_val) + + +def _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms=0): + ping_time = pd.date_range("2018-07-01", periods=ping_time_len, freq=ping_time_interval) + if ping_time_jitter_max_ms != 0: # if to add jitter + jitter = ( + np.random.randint(ping_time_jitter_max_ms, size=ping_time_len) / 1000 + ) # convert to seconds + ping_time = pd.to_datetime(ping_time.astype(int) / 1e9 + jitter, unit="s") + return ping_time + + +def _gen_Sv_echo_range_regular( + channel_len=2, + depth_len=100, + depth_interval=0.5, + ping_time_len=600, + ping_time_interval="0.3S", + ping_time_jitter_max_ms=0, + random_number_generator=None, +): + """ + Generate a Sv dataset with uniform echo_range across all ping_time. + + ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. 
+ + Parameters + ------------ + channel_len + number of channels + depth_len + number of total depth bins + depth_interval + depth intervals, may have multiple values + ping_time_len + total number of ping_time + ping_time_interval + interval between pings + ping_time_jitter_max_ms + jitter of ping_time in milliseconds + """ + + if random_number_generator is None: + random_number_generator = np.random.default_rng() + + # regular echo_range + echo_range = np.array([[np.arange(depth_len)] * ping_time_len] * channel_len) * depth_interval + + # generate dataset + ds_Sv = xr.Dataset( + data_vars={ + "Sv": ( + ["channel", "ping_time", "range_sample"], + random_number_generator.random(size=(channel_len, ping_time_len, depth_len)), + ), + "echo_range": (["channel", "ping_time", "range_sample"], echo_range), + "frequency_nominal": (["channel"], np.arange(channel_len)), + }, + coords={ + "channel": [f"ch_{ch}" for ch in range(channel_len)], + "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), + "range_sample": np.arange(depth_len), + }, + ) + + return ds_Sv + + +def _gen_Sv_echo_range_irregular( + channel_len=2, + depth_len=100, + depth_interval=[0.5, 0.32, 0.13], + depth_ping_time_len=[100, 300, 200], + ping_time_len=600, + ping_time_interval="0.3S", + ping_time_jitter_max_ms=0, + random_number_generator=None, +): + """ + Generate a Sv dataset with uniform echo_range across all ping_time. + + ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. + + Parameters + ------------ + channel_len + number of channels + depth_len + number of total depth bins + depth_interval + depth intervals, may have multiple values + depth_ping_time_len + the number of pings to use each of the depth_interval + for example, with depth_interval=[0.5, 0.32, 0.13] + and depth_ping_time_len=[100, 300, 200], + the first 100 pings have echo_range with depth intervals of 0.5 m, + the next 300 pings have echo_range with depth intervals of 0.32 m, + and the last 200 pings have echo_range with depth intervals of 0.13 m. 
+ ping_time_len + total number of ping_time + ping_time_interval + interval between pings + ping_time_jitter_max_ms + jitter of ping_time in milliseconds + """ + if random_number_generator is None: + random_number_generator = np.random.default_rng() + + # check input + if len(depth_interval) != len(depth_ping_time_len): + raise ValueError("The number of depth_interval and depth_ping_time_len must be equal!") + + if ping_time_len != np.array(depth_ping_time_len).sum(): + raise ValueError("The number of total pings does not match!") + + # irregular echo_range + echo_range_list = [] + for d, dp in zip(depth_interval, depth_ping_time_len): + echo_range_list.append(np.array([[np.arange(depth_len)] * dp] * channel_len) * d) + echo_range = np.hstack(echo_range_list) + + # generate dataset + ds_Sv = xr.Dataset( + data_vars={ + "Sv": ( + ["channel", "ping_time", "range_sample"], + random_number_generator.random(size=(channel_len, ping_time_len, depth_len)), + ), + "echo_range": (["channel", "ping_time", "range_sample"], echo_range), + "frequency_nominal": (["channel"], np.arange(channel_len)), + }, + coords={ + "channel": [f"ch_{ch}" for ch in range(channel_len)], + "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), + "range_sample": np.arange(depth_len), + }, + ) + + return ds_Sv + + +# End helper functions diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py new file mode 100644 index 000000000..6e3a84385 --- /dev/null +++ b/echopype/tests/commongrid/test_api.py @@ -0,0 +1,448 @@ +import pytest + +import numpy as np +import pandas as pd +from flox.xarray import xarray_reduce +import echopype as ep +from echopype.consolidate import add_location, add_depth +from echopype.commongrid.utils import ( + _parse_x_bin, + _groupby_x_along_channels, + get_distance_from_latlon, + compute_raw_NASC +) +from echopype.tests.commongrid.conftest import get_NASC_echoview + + +# Utilities Tests +@pytest.mark.parametrize( + ["x_bin", "x_label", "expected_result"], + [ + # Success + ("10m", "range_bin", 10.0), + ("0.2m", "range_bin", 0.2), + ("0.5nmi", "dist_bin", 0.5), + # Errored + (10, "range_bin", TypeError), + ("10km", "range_bin", ValueError), + ("10", "range_bin", ValueError), + ("10m", "invalid_label", KeyError), + ], +) +def test__parse_x_bin(x_bin, x_label, expected_result): + if x_label == "invalid_label": + expected_error_msg = r"x_label must be one of" + elif isinstance(x_bin, int): + expected_error_msg = r"must be a string" + elif x_bin in ["10km", "10"]: + expected_error_msg = r"must be in" + + if not isinstance(expected_result, float): + with pytest.raises(expected_result, match=expected_error_msg): + ep.commongrid.api._parse_x_bin(x_bin, x_label) + else: + assert ep.commongrid.api._parse_x_bin(x_bin, x_label) == expected_result + + +@pytest.mark.unit +@pytest.mark.parametrize( + ["range_var", "lat_lon"], [("depth", False), ("echo_range", False)] +) +def test__groupby_x_along_channels(request, range_var, lat_lon): + """Testing the underlying function of compute_MVBS and compute_NASC""" + range_bin = 20 + ping_time_bin = "20S" + method = "map-reduce" + + flox_kwargs = {"reindex": True} + + # Retrieve the correct dataset + if range_var == "depth": + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular_w_depth") + else: + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular") + + # compute range interval + echo_range_max = ds_Sv[range_var].max() + range_interval = np.arange(0, echo_range_max + range_bin, range_bin) + + # 
create bin information needed for ping_time + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .asfreq() + .indexes["ping_time"] + ) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]) + + sv_mean = _groupby_x_along_channels( + ds_Sv, + range_interval, + x_interval=ping_interval, + x_var="ping_time", + range_var=range_var, + method=method, + **flox_kwargs + ) + + # Check that the range_var is in the dimension + assert f"{range_var}_bins" in sv_mean.dims + + +# NASC Tests +@pytest.mark.integration +@pytest.mark.parametrize("compute_mvbs", [True, False]) +def test_compute_NASC(request, test_data_samples, compute_mvbs): + if any(request.node.callspec.id.startswith(id) for id in ["ek80", "azfp"]): + pytest.skip("Skipping NASC test for ek80 and azfp, no data available") + + ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) = test_data_samples + ed = ep.open_raw(filepath, sonar_model, azfp_xml_path) + if ed.sonar_model.lower() == "azfp": + avg_temperature = ed["Environment"]["temperature"].values.mean() + env_params = { + "temperature": avg_temperature, + "salinity": 27.9, + "pressure": 59, + } + range_kwargs["env_params"] = env_params + if "azfp_cal_type" in range_kwargs: + range_kwargs.pop("azfp_cal_type") + ds_Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) + + # Adds location and depth information + ds_Sv = ds_Sv.pipe(add_location, ed).pipe( + add_depth, depth_offset=ed["Platform"].water_level.values + ) + + if compute_mvbs: + range_bin = "2m" + ping_time_bin = "1s" + + ds_Sv = ds_Sv.pipe( + ep.commongrid.compute_MVBS, + range_var="depth", + range_bin=range_bin, + ping_time_bin=ping_time_bin, + ) + + dist_bin = "0.5nmi" + range_bin = "10m" + + ds_NASC = ep.commongrid.compute_NASC(ds_Sv, range_bin=range_bin, dist_bin=dist_bin) + assert ds_NASC is not None + + dist_nmi = get_distance_from_latlon(ds_Sv) + + # Check dimensions + dist_bin = _parse_x_bin(dist_bin, "dist_bin") + range_bin = _parse_x_bin(range_bin) + da_NASC = ds_NASC["NASC"] + assert da_NASC.dims == ("channel", "distance", "depth") + assert np.all(ds_NASC["channel"].values == ds_Sv["channel"].values) + assert da_NASC["depth"].size == np.ceil(ds_Sv["depth"].max() / range_bin) + assert da_NASC["distance"].size == np.ceil(dist_nmi.max() / dist_bin) + + +@pytest.mark.unit +def test_simple_NASC_Echoview_values(mock_Sv_dataset_NASC): + dist_interval = np.array([-5, 10]) + range_interval = np.array([1, 5]) + raw_NASC = compute_raw_NASC( + mock_Sv_dataset_NASC, + range_interval, + dist_interval, + ) + for ch_idx, _ in enumerate(raw_NASC.channel): + NASC_echoview = get_NASC_echoview(mock_Sv_dataset_NASC, ch_idx) + assert np.allclose( + raw_NASC.sv.isel(channel=ch_idx)[0, 0], NASC_echoview, atol=1e-10, rtol=1e-10 + ) + + +# MVBS Tests +@pytest.mark.integration +def test_compute_MVBS_index_binning(ds_Sv_echo_range_regular, regular_data_params): + """Test compute_MVBS_index_binning on mock data""" + + ping_num = 3 # number of pings to average over + range_sample_num = 7 # number of range_samples to average over + nchan = regular_data_params["channel_len"] + npings = regular_data_params["ping_time_len"] + nrange_samples = regular_data_params["depth_len"] + + # Binned MVBS test + ds_MVBS = ep.commongrid.compute_MVBS_index_binning( + ds_Sv_echo_range_regular, range_sample_num=range_sample_num, ping_num=ping_num + ) + + # Shape test + data_binned_shape = np.ceil( + (nchan, npings / ping_num, nrange_samples / range_sample_num) + ).astype(int) + assert 
np.all(ds_MVBS.Sv.shape == data_binned_shape) + + # Expected values compute + # average should be done in linear domain + da_sv = 10 ** (ds_Sv_echo_range_regular["Sv"] / 10) + expected = 10 * np.log10( + da_sv.coarsen(ping_time=ping_num, range_sample=range_sample_num, boundary="pad").mean( + skipna=True + ) + ) + + # Test all values in MVBS + assert np.array_equal(ds_MVBS.Sv.data, expected.data) + + +@pytest.mark.unit +@pytest.mark.parametrize( + ["range_bin", "ping_time_bin"], [(5, "10S"), ("10m", 10), ("10km", "10S"), ("10", "10S")] +) +def test_compute_MVBS_bin_inputs_fail(ds_Sv_echo_range_regular, range_bin, ping_time_bin): + expected_error = ValueError + if isinstance(range_bin, int) or isinstance(ping_time_bin, int): + expected_error = TypeError + match = r"must be a string" + else: + match = r"Range bin must be in meters" + + with pytest.raises(expected_error, match=match): + ep.commongrid.compute_MVBS( + ds_Sv_echo_range_regular, range_bin=range_bin, ping_time_bin=ping_time_bin + ) + + +@pytest.mark.unit +def test_compute_MVBS_w_latlon(ds_Sv_echo_range_regular_w_latlon, lat_attrs, lon_attrs): + """Testing for compute_MVBS with latitude and longitude""" + from echopype.consolidate.api import POSITION_VARIABLES + + ds_MVBS = ep.commongrid.compute_MVBS( + ds_Sv_echo_range_regular_w_latlon, range_bin="5m", ping_time_bin="10S" + ) + for var in POSITION_VARIABLES: + # Check to ensure variable is in dataset + assert var in ds_MVBS.data_vars + # Check for correct shape, which is just ping time + assert ds_MVBS[var].shape == ds_MVBS.ping_time.shape + + # Check if attributes match + if var == "latitude": + assert ds_MVBS[var].attrs == lat_attrs + elif var == "longitude": + assert ds_MVBS[var].attrs == lon_attrs + + +@pytest.mark.unit +@pytest.mark.parametrize("range_var", ["my_range", "echo_range", "depth"]) +def test_compute_MVBS_invalid_range_var(ds_Sv_echo_range_regular, range_var): + """Test compute MVBS range_var on mock data""" + + if range_var == "my_range": + with pytest.raises(ValueError, match="range_var must be one of 'echo_range' or 'depth'."): + ep.commongrid.compute_MVBS(ds_Sv_echo_range_regular, range_var=range_var) + elif range_var == "depth": + with pytest.raises( + ValueError, match=r"Input Sv dataset must contain all of the following variables" + ): + ep.commongrid.compute_MVBS(ds_Sv_echo_range_regular, range_var=range_var) + else: + pass + + +@pytest.mark.integration +def test_compute_MVBS(test_data_samples): + """ + Test running through from open_raw to compute_MVBS. 
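+ Runs once per entry of the parametrized ``test_data_samples`` fixture.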
+ """ + ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) = test_data_samples + ed = ep.open_raw(filepath, sonar_model, azfp_xml_path) + if ed.sonar_model.lower() == "azfp": + avg_temperature = ed["Environment"]["temperature"].values.mean() + env_params = { + "temperature": avg_temperature, + "salinity": 27.9, + "pressure": 59, + } + range_kwargs["env_params"] = env_params + if "azfp_cal_type" in range_kwargs: + range_kwargs.pop("azfp_cal_type") + Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) + ping_time_bin = "20S" + ds_MVBS = ep.commongrid.compute_MVBS(Sv, ping_time_bin=ping_time_bin) + assert ds_MVBS is not None + + # Test to see if ping_time was resampled correctly + expected_ping_time = ( + Sv["ping_time"].resample(ping_time=ping_time_bin, skipna=True).asfreq().indexes["ping_time"] + ) + assert np.array_equal(ds_MVBS.ping_time.data, expected_ping_time.values) + + +@pytest.mark.integration +@pytest.mark.parametrize( + ("er_type"), + [ + ("regular"), + ("irregular"), + ], +) +def test_compute_MVBS_range_output(request, er_type): + """ + Tests the shape of compute_MVBS output on regular and irregular data. + The irregularity in the input echo_range would cause some rows or columns + of the output Sv to contain NaN. + Here we test for the expected shape after dropping the NaNs + for specific ping_time bins. + """ + # set jitter=0 to get predictable number of ping within each echo_range groups + if er_type == "regular": + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular") + else: + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_irregular") + + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin="5m", ping_time_bin="10S") + + if er_type == "regular": + expected_len = ( + ds_Sv["channel"].size, # channel + np.ceil(np.diff(ds_Sv["ping_time"][[0, -1]].astype(int)) / 1e9 / 10), # ping_time + np.ceil(ds_Sv["echo_range"].max() / 5), # depth + ) + assert ds_MVBS["Sv"].shape == expected_len + else: + assert (ds_MVBS["Sv"].isel(ping_time=slice(None, 3)).dropna(dim="echo_range").shape) == ( + 2, + 3, + 10, + ) # full array, no NaN + assert (ds_MVBS["Sv"].isel(ping_time=slice(3, 12)).dropna(dim="echo_range").shape) == ( + 2, + 9, + 7, + ) # bottom bins contain NaN + assert (ds_MVBS["Sv"].isel(ping_time=slice(12, None)).dropna(dim="echo_range").shape) == ( + 2, + 6, + 3, + ) # bottom bins contain NaN + + +@pytest.mark.integration +@pytest.mark.parametrize( + ("er_type"), + [ + ("regular"), + ("irregular"), + ], +) +def test_compute_MVBS_values(request, er_type): + """Tests for the values of compute_MVBS on regular and irregular data.""" + + def _parse_nans(mvbs, ds_Sv) -> np.ndarray: + """Go through and figure out nan values in result""" + echo_range_step = np.unique(np.diff(mvbs.Sv.echo_range.values))[0] + expected_outs = [] + # Loop over channels + for chan in mvbs.Sv.channel.values: + # Get ping times for this channel + ping_times = mvbs.Sv.ping_time.values + # Compute the total number of pings + ping_len = len(ping_times) + # Variable to store the expected output for this channel + chan_expected = [] + for idx in range(ping_len): + # Loop over pings and create slices + if idx < ping_len - 1: + ping_slice = slice(ping_times[idx], ping_times[idx + 1]) + else: + ping_slice = slice(ping_times[idx], None) + + # Get the original echo_range data for this channel and ping slice + da = ds_Sv.echo_range.sel(channel=chan, ping_time=ping_slice) + # Drop the nan values since this shouldn't be included in actual + # computation for compute_MVBS, a.k.a. 
'nanmean' + mean_values = da.dropna(dim="ping_time", how="all").values + # Compute the histogram of the mean values to get distribution + hist, _ = np.histogram( + mean_values[~np.isnan(mean_values)], + bins=np.append( + mvbs.Sv.echo_range.values, + # Add one more bin to account for the last value + mvbs.Sv.echo_range.values.max() + echo_range_step, + ), + ) + # Convert any non-zero values to False, and zero values to True + # to imitate having nan values since there's no value for that bin + chan_expected.append([False if v > 0 else True for v in hist]) + expected_outs.append(chan_expected) + return np.array(expected_outs) + + range_bin = "2m" + ping_time_bin = "1s" + + if er_type == "regular": + ds_Sv = request.getfixturevalue("mock_Sv_dataset_regular") + expected_mvbs = request.getfixturevalue("mock_mvbs_array_regular") + else: + # Mock irregular dataset contains jitter + # and NaN values in the bottom echo_range + ds_Sv = request.getfixturevalue("mock_Sv_dataset_irregular") + expected_mvbs = request.getfixturevalue("mock_mvbs_array_irregular") + + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin=range_bin, ping_time_bin=ping_time_bin) + + expected_outputs = _parse_nans(ds_MVBS, ds_Sv) + + assert ds_MVBS.Sv.shape == expected_mvbs.shape + # Floating digits need to check with all close not equal + # Compare the values of the MVBS array with the expected values + assert np.allclose(ds_MVBS.Sv.values, expected_mvbs, atol=1e-10, rtol=1e-10, equal_nan=True) + + # Ensures that the computation of MVBS takes doesn't take into account NaN values + # that are sporadically placed in the echo_range values + assert np.array_equal(np.isnan(ds_MVBS.Sv.values), expected_outputs) + + +@pytest.mark.integration +@pytest.mark.parametrize( + ("er_type"), + [ + ("regular"), + ("irregular"), + ], +) +def test_compute_NASC_values(request, er_type): + """Tests for the values of compute_NASC on regular and irregular data.""" + + range_bin = "2m" + dist_bin = "0.5nmi" + + if er_type == "regular": + ds_Sv = request.getfixturevalue("mock_Sv_dataset_regular") + expected_nasc = request.getfixturevalue("mock_nasc_array_regular") + else: + # Mock irregular dataset contains jitter + # and NaN values in the bottom echo_range + ds_Sv = request.getfixturevalue("mock_Sv_dataset_irregular") + expected_nasc = request.getfixturevalue("mock_nasc_array_irregular") + + ds_NASC = ep.commongrid.compute_NASC(ds_Sv, range_bin=range_bin, dist_bin=dist_bin) + + assert ds_NASC.NASC.shape == expected_nasc.shape + # Floating digits need to check with all close not equal + # Compare the values of the MVBS array with the expected values + assert np.allclose( + ds_NASC.NASC.values, expected_nasc.values, atol=1e-10, rtol=1e-10, equal_nan=True + ) diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py deleted file mode 100644 index 449fe0b9c..000000000 --- a/echopype/tests/commongrid/test_mvbs.py +++ /dev/null @@ -1,835 +0,0 @@ -import dask.array -import numpy as np -from numpy.random import default_rng -import pandas as pd -import pytest -from typing import Tuple, Iterable, Union -import xarray as xr - -import echopype as ep -from echopype.commongrid.mvbs import bin_and_mean_2d - - -@pytest.fixture( - params=[ - ( - ("EK60", "ncei-wcsd", "Summer2017-D20170719-T211347.raw"), - "EK60", - None, - {}, - ), - ( - ("EK80_NEW", "echopype-test-D20211004-T235930.raw"), - "EK80", - None, - {'waveform_mode': 'BB', 'encode_mode': 'complex'}, - ), - ( - ("EK80_NEW", "D20211004-T233354.raw"), - "EK80", - None, - 
{'waveform_mode': 'CW', 'encode_mode': 'power'}, - ), - ( - ("EK80_NEW", "D20211004-T233115.raw"), - "EK80", - None, - {'waveform_mode': 'CW', 'encode_mode': 'complex'}, - ), - (("ES70", "D20151202-T020259.raw"), "ES70", None, {}), - (("AZFP", "17082117.01A"), "AZFP", ("AZFP", "17041823.XML"), {}), - ( - ("AD2CP", "raw", "090", "rawtest.090.00001.ad2cp"), - "AD2CP", - None, - {}, - ), - ], - ids=[ - "ek60_cw_power", - "ek80_bb_complex", - "ek80_cw_power", - "ek80_cw_complex", - "es70", - "azfp", - "ad2cp", - ], -) -def test_data_samples(request, test_path): - ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - ) = request.param - if sonar_model.lower() in ['es70', 'ad2cp']: - pytest.xfail( - reason="Not supported at the moment", - ) - path_model, *paths = filepath - filepath = test_path[path_model].joinpath(*paths) - - if azfp_xml_path is not None: - path_model, *paths = azfp_xml_path - azfp_xml_path = test_path[path_model].joinpath(*paths) - return ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - ) - - -def _construct_MVBS_toy_data( - nchan, npings, nrange_samples, ping_size, range_sample_size -): - """Construct data with values that increase every ping_num and ``range_sample_num`` - so that the result of computing MVBS is a smaller array - that increases regularly for each resampled ``ping_time`` and ``range_sample`` - - Parameters - ---------- - nchan : int - number of channels - npings : int - number of pings - nrange_samples : int - number of range samples - ping_size : int - number of pings with the same value - range_sample_size : int - number of range samples with the same value - - Returns - ------- - np.ndarray - Array with blocks of ``ping_time`` and ``range_sample`` with the same value, - so that computing the MVBS will result in regularly increasing values - every row and column - """ - data = np.ones((nchan, npings, nrange_samples)) - for p_i, ping in enumerate(range(0, npings, ping_size)): - for r_i, rb in enumerate(range(0, nrange_samples, range_sample_size)): - data[0, ping : ping + ping_size, rb : rb + range_sample_size] += ( - r_i + p_i - ) - # First channel increases by 1 each row and column, second increases by 2, third by 3, etc. - for f in range(nchan): - data[f] = data[0] * (f + 1) - - return data - - -def _construct_MVBS_test_data(nchan, npings, nrange_samples): - """Construct data for testing the toy data from - `_construct_MVBS_toy_data` after it has gone through the - MVBS calculation. 
- - Parameters - ---------- - nchan : int - number of channels - npings : int - number of pings - nrange_samples : int - number of range samples - - Returns - ------- - np.ndarray - Array with values that increases regularly - every ping and range sample - """ - - # Construct test array - test_array = np.add(*np.indices((npings, nrange_samples))) - return np.array([(test_array + 1) * (f + 1) for f in range(nchan)]) - - -def test_compute_MVBS_index_binning(): - """Test compute_MVBS_index_binning on toy data""" - - # Parameters for toy data - nchan, npings, nrange_samples = 4, 40, 400 - ping_num = 3 # number of pings to average over - range_sample_num = 7 # number of range_samples to average over - - # Construct toy data that increases regularly every ping_num and range_sample_num - data = _construct_MVBS_toy_data( - nchan=nchan, - npings=npings, - nrange_samples=nrange_samples, - ping_size=ping_num, - range_sample_size=range_sample_num, - ) - - data_log = 10 * np.log10(data) # Convert to log domain - chan_index = np.arange(nchan).astype(str) - ping_index = np.arange(npings) - range_sample = np.arange(nrange_samples) - Sv = xr.DataArray( - data_log, - coords=[ - ('channel', chan_index), - ('ping_time', ping_index), - ('range_sample', range_sample), - ], - ) - Sv.name = "Sv" - ds_Sv = Sv.to_dataset() - ds_Sv["frequency_nominal"] = chan_index # just so there's value in freq_nominal - ds_Sv = ds_Sv.assign( - echo_range=xr.DataArray( - np.array([[np.linspace(0, 10, nrange_samples)] * npings] * nchan), - coords=Sv.coords, - ) - ) - - # Binned MVBS test - ds_MVBS = ep.commongrid.compute_MVBS_index_binning( - ds_Sv, range_sample_num=range_sample_num, ping_num=ping_num - ) - data_test = 10 ** (ds_MVBS.Sv / 10) # Convert to linear domain - - # Shape test - data_binned_shape = np.ceil( - (nchan, npings / ping_num, nrange_samples / range_sample_num) - ).astype(int) - assert np.all(data_test.shape == data_binned_shape) - - # Construct test array that increases by 1 for each range_sample and ping_time - test_array = _construct_MVBS_test_data( - nchan, data_binned_shape[1], data_binned_shape[2] - ) - - # Test all values in MVBS - assert np.allclose(data_test, test_array, rtol=0, atol=1e-12) - - -def _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin): - """A collection of tests for test_compute_MVBS""" - - ds_MVBS = ep.commongrid.compute_MVBS( - ds_Sv, - range_meter_bin=range_meter_bin, - ping_time_bin=f'{ping_time_bin}S', - ) - - data_test = 10 ** (ds_MVBS.Sv / 10) # Convert to linear domain - - # Shape test - data_binned_shape = np.ceil((nchan, ping_num, range_sample_num)).astype(int) - assert np.all(data_test.shape == data_binned_shape) - - # Construct test array that increases by 1 for each range_sample and ping_time - test_array = _construct_MVBS_test_data( - nchan, data_binned_shape[1], data_binned_shape[2] - ) - - # Test all values in MVBS - assert np.allclose(data_test, test_array, rtol=0, atol=1e-12) - - # Test to see if ping_time was resampled correctly - test_ping_time = pd.date_range( - '1/1/2020', periods=np.ceil(ping_num), freq=f'{ping_time_bin}S' - ) - assert np.array_equal(data_test.ping_time, test_ping_time) - - # Test to see if range was resampled correctly - test_range = np.arange(0, total_range, range_meter_bin) - assert np.array_equal(data_test.echo_range, test_range) - - -def _fill_w_nans(narr, nan_ping_time, nan_range_sample): - """ - A routine that fills a numpy array with nans. 
- - Parameters - ---------- - narr : numpy array - Array of dimensions (ping_time, range_sample) - nan_ping_time : list - ping times to fill with nans - nan_range_sample: list - range samples to fill with nans - """ - if len(nan_ping_time) != len(nan_range_sample): - raise ValueError('These lists must be the same size!') - - # fill in nans according to the provided lists - for i, j in zip(nan_ping_time, nan_range_sample): - narr[i, j] = np.nan - - return narr - - -def _nan_cases_comp_MVBS(ds_Sv, chan): - """ - For a single channel, obtains numpy array - filled with nans for various cases - """ - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - - # ping times to fill with NaNs - nan_ping_time_1 = [slice(None), slice(None)] - # range samples to fill with NaNs - nan_range_sample_1 = [3, 4] - # pad all ping_times with nans for a certain range_sample - case_1 = _fill_w_nans(one_chan_er, nan_ping_time_1, nan_range_sample_1) - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - # ping times to fill with NaNs - nan_ping_time_2 = [1, 3, 5, 9] - # range samples to fill with NaNs - nan_range_sample_2 = [slice(None), slice(None), slice(None), slice(None)] - # pad all range_samples of certain ping_times - case_2 = _fill_w_nans(one_chan_er, nan_ping_time_2, nan_range_sample_2) - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - # ping times to fill with NaNs - nan_ping_time_3 = [0, 2, 5, 7] - # range samples to fill with NaNs - nan_range_sample_3 = [slice(0, 2), slice(None), slice(None), slice(0, 3)] - # pad all range_samples of certain ping_times and - # pad some ping_times with nans for a certain range_sample - case_3 = _fill_w_nans(one_chan_er, nan_ping_time_3, nan_range_sample_3) - - return case_1, case_2, case_3 - - -def test_compute_MVBS(): - """Test compute_MVBS on toy data""" - - # Parameters for fake data - nchan, npings, nrange_samples = 4, 100, 4000 - range_meter_bin = 7 # range in meters to average over - ping_time_bin = 3 # number of seconds to average over - ping_rate = 2 # Number of pings per second - range_sample_per_meter = 30 # Number of range_samples per meter - - # Useful conversions - ping_num = ( - npings / ping_rate / ping_time_bin - ) # number of pings to average over - range_sample_num = ( - nrange_samples / range_sample_per_meter / range_meter_bin - ) # number of range_samples to average over - total_range = nrange_samples / range_sample_per_meter # total range in meters - - # Construct data with values that increase with range and time - # so that when compute_MVBS is performed, the result is a smaller array - # that increases by a constant for each meter_bin and time_bin - data = _construct_MVBS_toy_data( - nchan=nchan, - npings=npings, - nrange_samples=nrange_samples, - ping_size=ping_rate * ping_time_bin, - range_sample_size=range_sample_per_meter * range_meter_bin, - ) - - data_log = 10 * np.log10(data) # Convert to log domain - chan_index = np.arange(nchan).astype(str) - freq_nom = np.arange(nchan) - # Generate a date range with `npings` number of pings with the frequency of the ping_rate - ping_time = pd.date_range( - '1/1/2020', periods=npings, freq=f'{1/ping_rate}S' - ) - range_sample = np.arange(nrange_samples) - Sv = xr.DataArray( - data_log, - coords=[ - ('channel', chan_index), - ('ping_time', ping_time), - ('range_sample', range_sample), - ], - ) - Sv.name = "Sv" - ds_Sv = 
Sv.to_dataset() - ds_Sv = ds_Sv.assign( - frequency_nominal=xr.DataArray(freq_nom, coords={'channel': chan_index}), - echo_range=xr.DataArray( - np.array( - [[np.linspace(0, total_range, nrange_samples)] * npings] * nchan - ), - coords=Sv.coords, - ) - ) - - # initial test of compute_MVBS - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # TODO: use @pytest.fixture params/ids - # for multiple similar tests using the same set of parameters - # different nan cases for a single channel - case_1, case_2, case_3 = _nan_cases_comp_MVBS(ds_Sv, chan='0') - - # pad all ping_times with nans for a certain range_sample - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_1 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # pad all range_samples of certain ping_times - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_2 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # pad all range_samples of certain ping_times and - # pad some ping_times with nans for a certain range_sample - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_3 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - -def test_commongrid_mvbs(test_data_samples): - """ - Test running through from open_raw to compute_MVBS. - """ - ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - ) = test_data_samples - ed = ep.open_raw(filepath, sonar_model, azfp_xml_path) - if ed.sonar_model.lower() == 'azfp': - avg_temperature = ed["Environment"]['temperature'].values.mean() - env_params = { - 'temperature': avg_temperature, - 'salinity': 27.9, - 'pressure': 59, - } - range_kwargs['env_params'] = env_params - if 'azfp_cal_type' in range_kwargs: - range_kwargs.pop('azfp_cal_type') - Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) - assert ep.commongrid.compute_MVBS(Sv) is not None - - -def create_bins(csum_array: np.ndarray) -> Iterable: - """ - Constructs bin ranges based off of a cumulative - sum array. - - Parameters - ---------- - csum_array: np.ndarray - 1D array representing a cumulative sum - - Returns - ------- - bins: list - A list whose elements are the lower and upper bin ranges - """ - - bins = [] - - # construct bins - for count, csum in enumerate(csum_array): - - if count == 0: - - bins.append([0, csum]) - - else: - - # add 0.01 so that left bins don't overlap - bins.append([csum_array[count-1] + 0.01, csum]) - - return bins - - -def create_echo_range_related_data(ping_bins: Iterable, - num_pings_in_bin: np.ndarray, - er_range: list, er_bins: Iterable, - final_num_er_bins: int, - create_dask: bool, - rng: np.random.Generator, - ping_bin_nan_ind: int) -> Tuple[list, list, list]: - """ - Creates ``echo_range`` values and associated bin information. 
- - Parameters - ---------- - ping_bins: list - A list whose elements are the lower and upper ping time bin ranges - num_pings_in_bin: np.ndarray - Specifies the number of pings in each ping time bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - er_bins: list - A list whose elements are the lower and upper echo range bin ranges - final_num_er_bins: int - The total number of echo range bins - create_dask: bool - If True ``final_arrays`` values will be - dask arrays, else they will be numpy arrays - rng: np.random.Generator - The generator for random values - ping_bin_nan_ind: int - The ping bin index to fill with NaNs - - Returns - ------- - all_er_bin_nums: list of np.ndarray - A list whose elements are the number of values in each echo_range - bin, for each ping bin - ping_times_in_bin: list of np.ndarray - A list whose elements are the ping_time values for each corresponding bin - final_arrays: list of np.ndarray or dask.array.Array - A list whose elements are the echo_range values for a given ping and - echo range bin block - """ - - final_arrays = [] - all_er_bin_nums = [] - ping_times_in_bin = [] - - # build echo_range array - for ping_ind, ping_bin in enumerate(ping_bins): - - # create the ping times associated with each ping bin - ping_times_in_bin.append(rng.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) - - # randomly determine the number of values in each echo_range bin - num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) - - # store the number of values in each echo_range bin - all_er_bin_nums.append(num_er_in_bin) - - er_row_block = [] - for count, bin_val in enumerate(er_bins): - - # create a block of echo_range values - if create_dask: - a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) - else: - a = rng.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) - - # store the block of echo_range values - er_row_block.append(a) - - # set all echo_range values at ping index to NaN - if ping_ind == ping_bin_nan_ind: - a[:, :] = np.nan - - # collect and construct echo_range row block - final_arrays.append(np.concatenate(er_row_block, axis=1)) - - return all_er_bin_nums, ping_times_in_bin, final_arrays - - -def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], - ping_csum: np.ndarray, - create_dask: bool) -> Tuple[Union[np.ndarray, dask.array.Array], int]: - """ - Creates the final 2D ``echo_range`` array with appropriate padding. 
- - Parameters - ---------- - final_arrays: list of np.ndarray - A list whose elements are the echo_range values for a given ping and - echo range bin block - ping_csum: np.ndarray - 1D array representing the cumulative sum for the number of ping times - in each ping bin - create_dask: bool - If True ``final_er`` will be a dask array, else it will be a numpy array - - Returns - ------- - final_er: np.ndarray or dask.array.Array - The final 2D ``echo_range`` array - max_num_er_elem: int - The maximum number of ``echo_range`` elements amongst all times - """ - - # get maximum number of echo_range elements amongst all times - max_num_er_elem = max([arr.shape[1] for arr in final_arrays]) - - # total number of ping times - tot_num_times = ping_csum[-1] - - # pad echo_range dimension with nans and create final echo_range - if create_dask: - final_er = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan - else: - final_er = np.empty((tot_num_times, max_num_er_elem)) - final_er[:] = np.nan - - for count, arr in enumerate(final_arrays): - - if count == 0: - final_er[0:ping_csum[count], 0:arr.shape[1]] = arr - else: - final_er[ping_csum[count - 1]:ping_csum[count], 0:arr.shape[1]] = arr - - return final_er, max_num_er_elem - - -def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, - all_er_bin_nums: Iterable[np.ndarray], - num_pings_in_bin: np.ndarray, - create_dask: bool, - ping_bin_nan_ind: int) -> Tuple[Union[np.ndarray, dask.array.Array], - np.ndarray]: - """ - Creates the final 2D Sv array with appropriate padding. - - Parameters - ---------- - max_num_er_elem: int - The maximum number of ``echo_range`` elements amongst all times - ping_csum: np.ndarray - 1D array representing the cumulative sum for the number of ping times - in each ping bin - all_er_bin_nums: list of np.ndarray - A list whose elements are the number of values in each echo_range - bin, for each ping bin - num_pings_in_bin: np.ndarray - Specifies the number of pings in each ping time bin - create_dask: bool - If True ``final_sv`` will be a dask array, else it will be a numpy array - ping_bin_nan_ind: int - The ping bin index to fill with NaNs - - Returns - ------- - final_sv: np.ndarray or dask.array.Array - The final 2D Sv array - final_MVBS: np.ndarray - The final 2D known MVBS array - """ - - # total number of ping times - tot_num_times = ping_csum[-1] - - # pad echo_range dimension with nans and create final sv - if create_dask: - final_sv = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan - else: - final_sv = np.empty((tot_num_times, max_num_er_elem)) - final_sv[:] = np.nan - - final_means = [] - for count, arr in enumerate(all_er_bin_nums): - - # create sv row values using natural numbers - sv_row_list = [np.arange(1, num_elem + 1, 1, dtype=np.float64) for num_elem in arr] - - # create final sv row - sv_row = np.concatenate(sv_row_list) - - # get final mean which is n+1/2 (since we are using natural numbers) - ping_mean = [(len(elem) + 1) / 2.0 for elem in sv_row_list] - - # create sv row block - sv_row_block = np.tile(sv_row, (num_pings_in_bin[count], 1)) - - if count == ping_bin_nan_ind: - - # fill values with NaNs - ping_mean = [np.nan]*len(sv_row_list) - sv_row_block[:, :] = np.nan - - # store means for ping - final_means.append(ping_mean) - - if count == 0: - final_sv[0:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block - else: - final_sv[ping_csum[count - 1]:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block - - # create final sv MVBS - final_MVBS = 
np.vstack(final_means) - - return final_sv, final_MVBS - - -def create_known_mean_data(final_num_ping_bins: int, - final_num_er_bins: int, - ping_range: list, - er_range: list, create_dask: bool, - rng: np.random.Generator) -> Tuple[np.ndarray, np.ndarray, Iterable, - Iterable, np.ndarray, np.ndarray]: - """ - Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays - where the MVBS is known. - - Parameters - ---------- - final_num_ping_bins: int - The total number of ping time bins - final_num_er_bins: int - The total number of echo range bins - ping_range: list - A list whose first element is the lowest and second element is - the highest possible number of ping time values in a given bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - create_dask: bool - If True the ``Sv`` and ``echo_range`` values produced will be - dask arrays, else they will be numpy arrays. - rng: np.random.Generator - generator for random integers - - Returns - ------- - final_MVBS: np.ndarray - The final 2D known MVBS array - final_sv: np.ndarray - The final 2D Sv array - ping_bins: Iterable - A list whose elements are the lower and upper ping time bin ranges - er_bins: Iterable - A list whose elements are the lower and upper echo range bin ranges - final_er: np.ndarray - The final 2D ``echo_range`` array - final_ping_time: np.ndarray - The final 1D ``ping_time`` array - """ - - # randomly generate the number of pings in each ping bin - num_pings_in_bin = rng.integers(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) - - # create bins for ping_time dimension - ping_csum = np.cumsum(num_pings_in_bin) - ping_bins = create_bins(ping_csum) - - # create bins for echo_range dimension - num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) - er_csum = np.cumsum(num_er_in_bin) - er_bins = create_bins(er_csum) - - # randomly select one ping bin to fill with NaNs - ping_bin_nan_ind = rng.choice(len(ping_bins)) - - # create the echo_range data and associated bin information - all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, - er_range, er_bins, - final_num_er_bins, - create_dask, - rng, - ping_bin_nan_ind) - - # create the final echo_range array using created data and padding - final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum, create_dask) - - # get final ping_time dimension - final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') - - # create the final sv array - final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, - all_er_bin_nums, num_pings_in_bin, - create_dask, ping_bin_nan_ind) - - return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time - - -@pytest.fixture( - params=[ - { - "create_dask": True, - "final_num_ping_bins": 10, - "final_num_er_bins": 10, - "ping_range": [10, 1000], - "er_range": [10, 1000] - }, - { - "create_dask": False, - "final_num_ping_bins": 10, - "final_num_er_bins": 10, - "ping_range": [10, 1000], - "er_range": [10, 1000] - }, - ], - ids=[ - "delayed_data", - "in_memory_data" - ], -) -def bin_and_mean_2d_params(request): - """ - Obtains all necessary parameters for ``test_bin_and_mean_2d``. 
- """ - - return list(request.param.values()) - - -def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: - """ - Tests the function ``bin_and_mean_2d``, which is the core - method for ``compute_MVBS``. This is done by creating mock - data (which can have varying number of ``echo_range`` values - for each ``ping_time``) with known means. - - Parameters - ---------- - create_dask: bool - If True the ``Sv`` and ``echo_range`` values produced will be - dask arrays, else they will be numpy arrays. - final_num_ping_bins: int - The total number of ping time bins - final_num_er_bins: int - The total number of echo range bins - ping_range: list - A list whose first element is the lowest and second element is - the highest possible number of ping time values in a given bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - """ - - # get all parameters needed to create the mock data - create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params - - # randomly generate a seed - seed = np.random.randint(low=10, high=100000, size=1)[0] - - print(f"seed used to generate mock data: {seed}") - - # establish generator for random integers - rng = default_rng(seed=seed) - - # seed dask random generator - if create_dask: - dask.array.random.seed(seed=seed) - - # create echo_range, ping_time, and Sv arrays where the MVBS is known - known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, - final_num_er_bins, - ping_range, er_range, - create_dask, - rng) - - # put the created ping bins into a form that works with bin_and_mean_2d - digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) - digitize_ping_bin = digitize_ping_bin.astype('datetime64[ns]') - - # put the created echo range bins into a form that works with bin_and_mean_2d - digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) - - # calculate MVBS for mock data set - calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, - bins_er=digitize_er_bin, times=final_ping_time, - echo_range=final_er, comprehensive_er_check=True) - - # compare known MVBS solution against its calculated counterpart - assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) diff --git a/echopype/tests/commongrid/test_nasc.py b/echopype/tests/commongrid/test_nasc.py deleted file mode 100644 index 0430f99c1..000000000 --- a/echopype/tests/commongrid/test_nasc.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest - -import numpy as np - -from echopype import open_raw -from echopype.calibrate import compute_Sv -# from echopype.commongrid import compute_NASC -from echopype.commongrid.nasc import ( - get_distance_from_latlon, - get_depth_bin_info, - get_dist_bin_info, - get_distance_from_latlon, -) -from echopype.consolidate import add_location, add_depth - - -@pytest.fixture -def ek60_path(test_path): - return test_path['EK60'] - - -# def test_compute_NASC(ek60_path): -# raw_path = ek60_path / "ncei-wcsd/Summer2017-D20170620-T011027.raw" - -# ed = open_raw(raw_path, sonar_model="EK60") -# ds_Sv = add_depth(add_location(compute_Sv(ed), ed, nmea_sentence="GGA")) -# cell_dist = 0.1 -# cell_depth = 20 -# ds_NASC = compute_NASC(ds_Sv, cell_dist, cell_depth) - -# dist_nmi = get_distance_from_latlon(ds_Sv) - -# # Check dimensions -# da_NASC = ds_NASC["NASC"] -# assert da_NASC.dims == ("channel", 
"distance", "depth") -# assert np.all(ds_NASC["channel"].values == ds_Sv["channel"].values) -# assert da_NASC["depth"].size == np.ceil(ds_Sv["depth"].max() / cell_depth) -# assert da_NASC["distance"].size == np.ceil(dist_nmi.max() / cell_dist) diff --git a/echopype/tests/convert/test_convert_azfp.py b/echopype/tests/convert/test_convert_azfp.py index 075fde0fb..487bec243 100644 --- a/echopype/tests/convert/test_convert_azfp.py +++ b/echopype/tests/convert/test_convert_azfp.py @@ -10,12 +10,14 @@ from scipy.io import loadmat from echopype import open_raw import pytest +from echopype.convert.parse_azfp import ParseAZFP @pytest.fixture def azfp_path(test_path): return test_path["AZFP"] + def check_platform_required_scalar_vars(echodata): # check convention-required variables in the Platform group for var in [ @@ -154,8 +156,8 @@ def test_convert_azfp_01a_different_ranges(azfp_path): check_platform_required_scalar_vars(echodata) -def test_convert_azfp_01a_notemperature_notilt(azfp_path): - """Test converting file with no valid temperature or tilt data.""" +def test_convert_azfp_01a_no_temperature_pressure_tilt(azfp_path): + """Test converting file with no valid temperature, pressure and tilt data.""" azfp_01a_path = azfp_path / 'rutgers_glider_notemperature/22052500.01A' azfp_xml_path = azfp_path / 'rutgers_glider_notemperature/22052501.XML' @@ -167,8 +169,102 @@ def test_convert_azfp_01a_notemperature_notilt(azfp_path): assert "temperature" in echodata["Environment"] assert echodata["Environment"]["temperature"].isnull().all() + # Pressure variable is present in the Environment group and its values are all nan + assert "pressure" in echodata["Environment"] + assert echodata["Environment"]["pressure"].isnull().all() + # Tilt variables are present in the Platform group and their values are all nan assert "tilt_x" in echodata["Platform"] assert "tilt_y" in echodata["Platform"] assert echodata["Platform"]["tilt_x"].isnull().all() assert echodata["Platform"]["tilt_y"].isnull().all() + + +def test_convert_azfp_01a_pressure_temperature(azfp_path): + """Test converting file with valid pressure and temperature data.""" + azfp_01a_path = azfp_path / 'pressure' / '22042221.01A' + azfp_xml_path = azfp_path / 'pressure' / '22042220.XML' + + echodata = open_raw( + raw_file=azfp_01a_path, sonar_model='AZFP', xml_path=azfp_xml_path + ) + + # Pressure variable is present in the Environment group and its values are not all nan + assert "pressure" in echodata["Environment"] + assert not echodata["Environment"]["pressure"].isnull().all() + + # Temperature variable is present in the Environment group and its values are not all nan + assert "temperature" in echodata["Environment"] + assert not echodata["Environment"]["temperature"].isnull().all() + + +def test_load_parse_azfp_xml(azfp_path): + + azfp_xml_path = azfp_path / '23081211.XML' + parseAZFP = ParseAZFP(None, str(azfp_xml_path)) + parseAZFP.load_AZFP_xml() + expected_params = ['instrument_type_string', 'instrument_type', 'major', 'minor', 'date', + 'program_name', 'program', 'CPU', 'serial_number', 'board_version', + 'file_version', 'parameter_version', 'configuration_version', 'backplane', + 'delay_transmission_string', 'delay_transmission', 'eclock', + 'digital_board_version', 'sensors_flag_pressure_sensor_installed', + 'sensors_flag_paros_installed', 'sensors_flag', 'U0', 'Y1', 'Y2', 'Y3', + 'C1', 'C2', 'C3', 'D1', 'D2', 'T1', 'T2', 'T3', 'T4', 'T5', 'X_a', 'X_b', + 'X_c', 'X_d', 'Y_a', 'Y_b', 'Y_c', 'Y_d', 'period', 'ppm_offset', + 'calibration', 'a0', 
'a1', 'a2', 'a3', 'ka', 'kb', 'kc', 'A', 'B', 'C', + 'num_freq', 'kHz_units', 'kHz', 'TVR', 'num_vtx', 'VTX0', 'VTX1', 'VTX2', + 'VTX3', 'BP', 'EL', 'DS', 'min_pulse_len', 'sound_speed', + 'start_date_svalue', 'start_date', 'num_frequencies', 'num_phases', + 'data_output_svalue', 'data_output', 'frequency_units', 'frequency', + 'phase_number', 'start_date_svalue_phase1', 'start_date_phase1', + 'phase_type_svalue_phase1', 'phase_type_phase1', 'duration_svalue_phase1', + 'duration_phase1', 'ping_period_units_phase1', 'ping_period_phase1', + 'burst_interval_units_phase1', 'burst_interval_phase1', + 'pings_per_burst_units_phase1', 'pings_per_burst_phase1', + 'average_burst_pings_units_phase1', 'average_burst_pings_phase1', + 'frequency_number_phase1', 'acquire_frequency_units_phase1', + 'acquire_frequency_phase1', 'pulse_len_units_phase1', 'pulse_len_phase1', + 'dig_rate_units_phase1', 'dig_rate_phase1', 'range_samples_units_phase1', + 'range_samples_phase1', 'range_averaging_samples_units_phase1', + 'range_averaging_samples_phase1', 'lock_out_index_units_phase1', + 'lock_out_index_phase1', 'gain_units_phase1', 'gain_phase1', + 'storage_format_units_phase1', 'storage_format_phase1', + 'start_date_svalue_phase2', 'start_date_phase2', 'phase_type_svalue_phase2', + 'phase_type_phase2', 'duration_svalue_phase2', 'duration_phase2', + 'ping_period_units_phase2', 'ping_period_phase2', + 'burst_interval_units_phase2', 'burst_interval_phase2', + 'pings_per_burst_units_phase2', 'pings_per_burst_phase2', + 'average_burst_pings_units_phase2', 'average_burst_pings_phase2', + 'frequency_number_phase2', 'acquire_frequency_units_phase2', + 'acquire_frequency_phase2', 'pulse_len_units_phase2', 'pulse_len_phase2', + 'dig_rate_units_phase2', 'dig_rate_phase2', 'range_samples_units_phase2', + 'range_samples_phase2', 'range_averaging_samples_units_phase2', + 'range_averaging_samples_phase2', 'lock_out_index_units_phase2', + 'lock_out_index_phase2', 'gain_units_phase2', 'gain_phase2', + 'storage_format_units_phase2', 'storage_format_phase2', 'rt_version', + 'rt_frequency', 'enabled', 'direction', 'water_depth_high_tide', + 'instrument_depth_high_tide'] + assert set(parseAZFP.parameters.keys()) == set(expected_params) + assert list(set(parseAZFP.parameters['instrument_type_string']))[0] == 'AZFP' + assert isinstance(parseAZFP.parameters['num_freq'], int) + assert parseAZFP.parameters['num_freq'] == 4 + assert parseAZFP.parameters['kHz'] == [67, 120, 200, 455] + + expected_len_params = ['acquire_frequency', 'pulse_len', 'dig_rate', + 'range_samples', 'range_averaging_samples', + 'lock_out_index', 'gain', 'storage_format'] + for num in parseAZFP.parameters["phase_number"]: + assert isinstance(parseAZFP.parameters[f"pulse_len_phase{num}"], list) + assert len(parseAZFP.parameters[f"acquire_frequency_phase{num}"]) == 4 + assert all(len(parseAZFP.parameters[f"{x}_phase{num}"]) == 4 for x in expected_len_params) + assert parseAZFP.parameters[f"frequency_number_phase{num}"] == ['1', '2', '3', '4'] + assert parseAZFP.parameters[f"acquire_frequency_phase{num}"] == [1, 1, 1, 1] + assert parseAZFP.parameters[f"dig_rate_phase{num}"] == [20000, 20000, 20000, 20000] + assert parseAZFP.parameters[f"range_averaging_samples_phase{num}"] == [1, 1, 1, 1] + assert parseAZFP.parameters[f"lock_out_index_phase{num}"] == [0, 0, 0, 0] + assert parseAZFP.parameters[f"gain_phase{num}"] == [1, 1, 1, 1] + assert parseAZFP.parameters[f"storage_format_phase{num}"] == [0, 0, 0, 0] + assert parseAZFP.parameters['pulse_len_phase1'] == [1000, 1000, 
1000, 1000] + assert parseAZFP.parameters['pulse_len_phase2'] == [0, 0, 0, 0] + assert parseAZFP.parameters['range_samples_phase1'] == [8273, 8273, 8273, 8273] + assert parseAZFP.parameters['range_samples_phase2'] == [2750, 2750, 2750, 2750] diff --git a/echopype/tests/convert/test_convert_source_target_locs.py b/echopype/tests/convert/test_convert_source_target_locs.py index 66cbbfe40..6b1af93e0 100644 --- a/echopype/tests/convert/test_convert_source_target_locs.py +++ b/echopype/tests/convert/test_convert_source_target_locs.py @@ -236,7 +236,7 @@ def test_convert_time_encodings(sonar_model, raw_file, xml_path, test_path): xml_path = str(test_path[path_model].joinpath(*xml_path).absolute()) ed = open_raw( - sonar_model=sonar_model, raw_file=raw_file, xml_path=xml_path + sonar_model=sonar_model, raw_file=raw_file, xml_path=xml_path, destination_path="no_swap" ) ed.to_netcdf(overwrite=True) for group, details in ed.group_map.items(): @@ -297,6 +297,7 @@ def test_convert_ek( raw_file=ipath, sonar_model=sonar_model, storage_options=input_storage_options, + destination_path="no_swap" ) if ( @@ -370,6 +371,7 @@ def test_convert_azfp( xml_path=azfp_xml_paths, sonar_model=model, storage_options=input_storage_options, + destination_path="no_swap" ) assert echodata.xml_path == azfp_xml_paths diff --git a/echopype/tests/convert/test_parsed_to_zarr.py b/echopype/tests/convert/test_parsed_to_zarr.py index 8ae96c5e7..1fb30ebc4 100644 --- a/echopype/tests/convert/test_parsed_to_zarr.py +++ b/echopype/tests/convert/test_parsed_to_zarr.py @@ -1,18 +1,97 @@ import pytest import xarray as xr -from typing import List, Tuple +import pandas as pd +from typing import List, Optional, Tuple from echopype import open_raw -from pathlib import Path +import shutil +from zarr.hierarchy import Group as ZGroup import os.path +from fsspec import FSMap +from s3fs import S3FileSystem +import requests +import time +from echopype.convert.parsed_to_zarr import Parsed2Zarr, DEFAULT_ZARR_TEMP_DIR +from echopype.convert.parsed_to_zarr_ek60 import Parsed2ZarrEK60 +from echopype.echodata.convention import sonarnetcdf_1 +from echopype.convert.api import _check_file, SONAR_MODELS + +pytestmark = pytest.mark.skip(reason="Removed Parsed2Zarr") + +test_bucket_name = "echopype-test" +port = 5555 +endpoint_uri = "http://127.0.0.1:%s/" % port + + +@pytest.fixture() +def s3_base(): + # writable local S3 system + import shlex + import subprocess + + try: + # should fail since we didn't start server yet + r = requests.get(endpoint_uri) + except: # noqa + pass + else: + if r.ok: + raise RuntimeError("moto server already up") + if "AWS_SECRET_ACCESS_KEY" not in os.environ: + os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" + if "AWS_ACCESS_KEY_ID" not in os.environ: + os.environ["AWS_ACCESS_KEY_ID"] = "foo" + proc = subprocess.Popen( + shlex.split("moto_server s3 -p %s" % port), + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stdin=subprocess.DEVNULL, + ) + + timeout = 5 + while timeout > 0: + try: + print("polling for moto server") + r = requests.get(endpoint_uri) + if r.ok: + break + except: # noqa + pass + timeout -= 0.1 + time.sleep(0.1) + print("server up") + yield + print("moto done") + proc.terminate() + proc.wait() + + +def get_boto3_client(): + from botocore.session import Session + + # NB: we use the sync botocore client for setup + session = Session() + return session.create_client("s3", endpoint_url=endpoint_uri) + + +@pytest.fixture() +def s3(s3_base): + client = get_boto3_client() + 
client.create_bucket(Bucket=test_bucket_name, ACL="public-read") + + S3FileSystem.clear_instance_cache() + s3 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) + s3.invalidate_cache() + yield s3 @pytest.fixture def ek60_path(test_path): - return test_path['EK60'] + return test_path["EK60"] -def compare_zarr_vars(ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, - var_to_comp: List[str], ed_path) -> Tuple[xr.Dataset, xr.Dataset]: +def compare_zarr_vars( + ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, var_to_comp: List[str], ed_path +) -> Tuple[xr.Dataset, xr.Dataset]: """ Compares the dask variables in ``ed_zarr`` against their counterparts in ``ed_no_zarr`` by computing the dask results @@ -41,9 +120,7 @@ def compare_zarr_vars(ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, """ for var in var_to_comp: - for chan in ed_zarr[ed_path][var].channel: - # here we compute to make sure values are being compared, rather than just shapes var_zarr = ed_zarr[ed_path][var].sel(channel=chan).compute() var_no_zarr = ed_no_zarr[ed_path][var].sel(channel=chan) @@ -56,13 +133,13 @@ def compare_zarr_vars(ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, @pytest.mark.parametrize( - ["raw_file", "sonar_model", "use_swap"], + ["raw_file", "sonar_model", "destination_path"], [ - ("L0003-D20040909-T161906-EK60.raw", "EK60", True), + ("L0003-D20040909-T161906-EK60.raw", "EK60", "swap"), pytest.param( "L0003-D20040909-T161906-EK60.raw", "EK60", - False, + "no_swap", marks=pytest.mark.xfail( run=False, reason="Expected out of memory error. See /~https://github.com/OSOceanAcoustics/echopype/issues/489", @@ -71,18 +148,17 @@ def compare_zarr_vars(ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, ], ids=["noaa_offloaded", "noaa_not_offloaded"], ) -def test_raw2zarr(raw_file, sonar_model, use_swap, ek60_path): +def test_raw2zarr(raw_file, sonar_model, destination_path, ek60_path): """Tests for memory expansion relief""" import os from tempfile import TemporaryDirectory from echopype.echodata.echodata import EchoData - name = os.path.basename(raw_file).replace('.raw', '') - fname = f"{name}__{use_swap}.zarr" + + name = os.path.basename(raw_file).replace(".raw", "") + fname = f"{name}__{destination_path}.zarr" file_path = ek60_path / raw_file echodata = open_raw( - raw_file=file_path, - sonar_model=sonar_model, - use_swap=use_swap + raw_file=file_path, sonar_model=sonar_model, destination_path=destination_path ) # Most likely succeed if it doesn't crash assert isinstance(echodata, EchoData) @@ -92,14 +168,13 @@ def test_raw2zarr(raw_file, sonar_model, use_swap, ek60_path): # If it goes all the way to here it is most likely successful assert os.path.exists(output_save_path) - if use_swap: - # create a copy of zarr_file_name. 
The join is necessary so that we are not referencing zarr_file_name - temp_zarr_path = ''.join(echodata.parsed2zarr_obj.zarr_file_name) + if echodata.parsed2zarr_obj.store is not None: + temp_zarr_path = echodata.parsed2zarr_obj.store del echodata # make sure that the temporary zarr was deleted - assert Path(temp_zarr_path).exists() is False + assert temp_zarr_path.fs.exists(temp_zarr_path.root) is False @pytest.mark.parametrize( @@ -107,16 +182,23 @@ def test_raw2zarr(raw_file, sonar_model, use_swap, ek60_path): [ ("EK60", os.path.join("ncei-wcsd", "Summer2017-D20170615-T190214.raw"), "EK60"), ("EK60", "DY1002_EK60-D20100318-T023008_rep_freq.raw", "EK60"), - ("EK80", "Summer2018--D20180905-T033113.raw", "EK80"), + ("EK80", "Summer2018--D20180905-T033113.raw", "EK80"), ("EK80_CAL", "2018115-D20181213-T094600.raw", "EK80"), - ("EK80", "Green2.Survey2.FM.short.slow.-D20191004-T211557.raw", "EK80"), - ("EK80", "2019118 group2survey-D20191214-T081342.raw", "EK80"), + ("EK80", "Green2.Survey2.FM.short.slow.-D20191004-T211557.raw", "EK80"), + ("EK80", "2019118 group2survey-D20191214-T081342.raw", "EK80"), + ], + ids=[ + "ek60_summer_2017", + "ek60_rep_freq", + "ek80_summer_2018", + "ek80_bb_w_cal", + "ek80_short_slow", + "ek80_grp_2_survey", ], - ids=["ek60_summer_2017", "ek60_rep_freq", "ek80_summer_2018", - "ek80_bb_w_cal", "ek80_short_slow", "ek80_grp_2_survey"], ) -def test_direct_to_zarr_integration(path_model: str, raw_file: str, - sonar_model: str, test_path: dict) -> None: +def test_direct_to_zarr_integration( + path_model: str, raw_file: str, sonar_model: str, test_path: dict +) -> None: """ Integration Test that ensure writing variables directly to a temporary zarr store and then assigning @@ -144,11 +226,10 @@ def test_direct_to_zarr_integration(path_model: str, raw_file: str, raw_file_path = test_path[path_model] / raw_file - ed_zarr = open_raw(raw_file_path, sonar_model=sonar_model, use_swap=True, max_mb=100) - ed_no_zarr = open_raw(raw_file_path, sonar_model=sonar_model, use_swap=False) + ed_zarr = open_raw(raw_file_path, sonar_model=sonar_model, max_mb=100, destination_path="swap") + ed_no_zarr = open_raw(raw_file_path, sonar_model=sonar_model) for grp in ed_zarr.group_paths: - # remove conversion time so we can do a direct comparison if "conversion_time" in ed_zarr[grp].attrs: del ed_zarr[grp].attrs["conversion_time"] @@ -156,24 +237,193 @@ def test_direct_to_zarr_integration(path_model: str, raw_file: str, # Compare angle, power, complex, if zarr drop the zarr variables and compare datasets if grp == "Sonar/Beam_group2": - var_to_comp = ['angle_athwartship', 'angle_alongship', 'backscatter_r'] + var_to_comp = ["angle_athwartship", "angle_alongship", "backscatter_r"] ed_zarr, ed_no_zarr = compare_zarr_vars(ed_zarr, ed_no_zarr, var_to_comp, grp) if grp == "Sonar/Beam_group1": - - if 'backscatter_i' in ed_zarr[grp]: - var_to_comp = ['backscatter_r', 'backscatter_i'] + if "backscatter_i" in ed_zarr[grp]: + var_to_comp = ["backscatter_r", "backscatter_i"] else: - var_to_comp = ['angle_athwartship', 'angle_alongship', 'backscatter_r'] + var_to_comp = ["angle_athwartship", "angle_alongship", "backscatter_r"] ed_zarr, ed_no_zarr = compare_zarr_vars(ed_zarr, ed_no_zarr, var_to_comp, grp) - assert ed_zarr[grp].identical(ed_no_zarr[grp]) + assert ed_zarr[grp] is not None - # create a copy of zarr_file_name. 
The join is necessary so that we are not referencing zarr_file_name - temp_zarr_path = ''.join(ed_zarr.parsed2zarr_obj.zarr_file_name) + if ed_zarr.parsed2zarr_obj.store is not None: + temp_zarr_path = ed_zarr.parsed2zarr_obj.store - del ed_zarr + del ed_zarr - # make sure that the temporary zarr was deleted - assert Path(temp_zarr_path).exists() is False + # make sure that the temporary zarr was deleted + assert temp_zarr_path.fs.exists(temp_zarr_path.root) is False + + +class TestParsed2Zarr: + sample_file = "L0003-D20040909-T161906-EK60.raw" + sonar_model = "EK60" + xml_path = None + convert_params = None + storage_options = {} + max_mb = 100 + ek60_expected_shapes = { + "angle_alongship": (9923, 3, 10417), + "angle_athwartship": (9923, 3, 10417), + "channel": (3,), + "timestamp": (9923,), + "power": (9923, 3, 10417), + } + + @pytest.fixture(scope="class") + def ek60_parsed2zarr_obj(self, ek60_parser_obj): + return Parsed2ZarrEK60(ek60_parser_obj) + + @pytest.fixture(scope="class") + def ek60_parsed2zarr_obj_w_df(self, ek60_parsed2zarr_obj): + ek60_parsed2zarr_obj._create_zarr_info() + ek60_parsed2zarr_obj.datagram_df = pd.DataFrame.from_dict( + ek60_parsed2zarr_obj.parser_obj.zarr_datagrams + ) + # convert channel column to a string + ek60_parsed2zarr_obj.datagram_df["channel"] = ek60_parsed2zarr_obj.datagram_df["channel"].astype(str) + yield ek60_parsed2zarr_obj + + def _get_storage_options(self, dest_path: Optional[str]) -> Optional[dict]: + """Retrieve storage options for destination path""" + dest_storage_options = None + if dest_path is not None and "s3://" in dest_path: + dest_storage_options = {"anon": False, "client_kwargs": {"endpoint_url": endpoint_uri}} + return dest_storage_options + + @pytest.fixture(scope="class") + def ek60_parser_obj(self, test_path): + folder_path = test_path[self.sonar_model] + raw_file = str(folder_path / self.sample_file) + + file_chk, xml_chk = _check_file( + raw_file, self.sonar_model, self.xml_path, self.storage_options + ) + + if SONAR_MODELS[self.sonar_model]["xml"]: + params = xml_chk + else: + params = "ALL" + + # obtain dict associated with directly writing to zarr + dgram_zarr_vars = SONAR_MODELS[self.sonar_model]["dgram_zarr_vars"] + + # Parse raw file and organize data into groups + parser = SONAR_MODELS[self.sonar_model]["parser"]( + file_chk, + params=params, + storage_options=self.storage_options, + dgram_zarr_vars=dgram_zarr_vars, + ) + + # Parse the data + parser.parse_raw() + return parser + + @pytest.mark.parametrize( + ["sonar_model", "p2z_class"], + [ + (None, Parsed2Zarr), + ("EK60", Parsed2ZarrEK60), + ], + ) + def test_constructor(self, sonar_model, p2z_class, ek60_parser_obj): + if sonar_model is None: + p2z = p2z_class(None) + assert p2z.parser_obj is None + assert p2z.temp_zarr_dir is None + assert p2z.zarr_file_name is None + assert p2z.store is None + assert p2z.zarr_root is None + assert p2z._varattrs == sonarnetcdf_1.yaml_dict["variable_and_varattributes"] + else: + p2z = p2z_class(ek60_parser_obj) + assert isinstance(p2z.parser_obj, SONAR_MODELS[self.sonar_model]["parser"]) + assert p2z.sonar_model == self.sonar_model + + @pytest.mark.parametrize("dest_path", [None, "./", f"s3://{test_bucket_name}/my-dir/"]) + def test__create_zarr_info(self, ek60_parsed2zarr_obj, dest_path, s3): + dest_storage_options = self._get_storage_options(dest_path) + + ek60_parsed2zarr_obj._create_zarr_info(dest_path, dest_storage_options=dest_storage_options) + + zarr_store = ek60_parsed2zarr_obj.store + zarr_root = 
ek60_parsed2zarr_obj.zarr_root + + assert isinstance(zarr_store, FSMap) + assert isinstance(zarr_root, ZGroup) + assert zarr_store.root.endswith(".zarr") + + if dest_path is None: + assert os.path.dirname(zarr_store.root) == str(DEFAULT_ZARR_TEMP_DIR) + assert ek60_parsed2zarr_obj.temp_zarr_dir == str(DEFAULT_ZARR_TEMP_DIR) + elif "s3://" not in dest_path: + shutil.rmtree(zarr_store.root) + + def test__close_store(self, ek60_parsed2zarr_obj): + ek60_parsed2zarr_obj._create_zarr_info() + + zarr_store = ek60_parsed2zarr_obj.store + + # Initially metadata should not exist + assert not zarr_store.fs.exists(zarr_store.root + "/.zmetadata") + + # Close store will consolidate metadata + ek60_parsed2zarr_obj._close_store() + + # Now metadata should exist + assert zarr_store.fs.exists(zarr_store.root + "/.zmetadata") + + def test__write_power(self, ek60_parsed2zarr_obj_w_df): + # There shouldn't be any group here + assert "power" not in ek60_parsed2zarr_obj_w_df.zarr_root + + ek60_parsed2zarr_obj_w_df._write_power( + df=ek60_parsed2zarr_obj_w_df.datagram_df, + max_mb=self.max_mb + ) + + # There should now be power group + assert "power" in ek60_parsed2zarr_obj_w_df.zarr_root + + for k, arr in ek60_parsed2zarr_obj_w_df.zarr_root["/power"].arrays(): + assert arr.shape == self.ek60_expected_shapes[k] + + def test__write_angle(self, ek60_parsed2zarr_obj_w_df): + # There shouldn't be any group here + assert "angle" not in ek60_parsed2zarr_obj_w_df.zarr_root + + ek60_parsed2zarr_obj_w_df._write_angle( + df=ek60_parsed2zarr_obj_w_df.datagram_df, + max_mb=self.max_mb + ) + # There should now be angle group + assert "angle" in ek60_parsed2zarr_obj_w_df.zarr_root + + for k, arr in ek60_parsed2zarr_obj_w_df.zarr_root["/angle"].arrays(): + assert arr.shape == self.ek60_expected_shapes[k] + + def test_power_dataarray(self, ek60_parsed2zarr_obj_w_df): + power_dataarray = ek60_parsed2zarr_obj_w_df.power_dataarray + assert isinstance(power_dataarray, xr.DataArray) + + assert power_dataarray.name == "backscatter_r" + assert power_dataarray.dims == ("ping_time", "channel", "range_sample") + assert power_dataarray.shape == self.ek60_expected_shapes["power"] + + def test_angle_dataarrays(self, ek60_parsed2zarr_obj_w_df): + angle_athwartship, angle_alongship = ek60_parsed2zarr_obj_w_df.angle_dataarrays + assert isinstance(angle_athwartship, xr.DataArray) + assert isinstance(angle_alongship, xr.DataArray) + + assert angle_alongship.name == "angle_alongship" + assert angle_alongship.dims == ("ping_time", "channel", "range_sample") + assert angle_alongship.shape == self.ek60_expected_shapes["angle_alongship"] + + assert angle_athwartship.name == "angle_athwartship" + assert angle_athwartship.dims == ("ping_time", "channel", "range_sample") + assert angle_athwartship.shape == self.ek60_expected_shapes["angle_athwartship"] diff --git a/echopype/tests/echodata/test_echodata.py b/echopype/tests/echodata/test_echodata.py index e78c007eb..d58800ba9 100644 --- a/echopype/tests/echodata/test_echodata.py +++ b/echopype/tests/echodata/test_echodata.py @@ -1,6 +1,5 @@ from textwrap import dedent -import os import fsspec from pathlib import Path import shutil @@ -9,7 +8,6 @@ from zarr.errors import GroupNotFoundError import echopype -from echopype.calibrate.env_params_old import EnvParams from echopype.echodata import EchoData from echopype import open_converted from echopype.calibrate.calibrate_ek import CalibrateEK60, CalibrateEK80 @@ -325,8 +323,8 @@ def test_get_dataset(self, converted_zarr): def test_to_zarr_consolidated(self, 
mock_echodata, consolidated): """ Tests to_zarr consolidation. Currently, this test uses a mock EchoData object that only - has attributes. The consolidated flag provided will be used in every to_zarr call (which - is used to write each EchoData group to zarr_path). + has attributes. The consolidated flag provided will be used in every to_zarr call (which + is used to write each EchoData group to zarr_path). """ zarr_path = Path('test.zarr') mock_echodata.to_zarr(str(zarr_path), consolidated=consolidated, overwrite=True) @@ -693,3 +691,34 @@ def test_update_platform_no_update(test_path): variable_mappings = {"longitude": "longitude", "latitude": "latitude"} ed.update_platform(extra_platform_data, variable_mappings=variable_mappings) + +def test_update_platform_latlon_notimestamp(test_path): + raw_file = test_path["EK60"] / "ooi" / "CE02SHBP-MJ01C-07-ZPLSCB101_OOI-D20191201-T000000.raw" + ed = echopype.open_raw(raw_file, sonar_model="EK60") + + extra_platform_data = xr.Dataset( + { + "lon": ([], float(-100.0)), + "lat": ([], float(-50.0)), + } + ) + + platform_preexisting_dims = ed["Platform"].dims + + # variable names in mappings different from actual external dataset + variable_mappings = {"longitude": "lon", "latitude": "lat"} + + ed.update_platform(extra_platform_data, variable_mappings=variable_mappings) + + # Updated variables are not all nan + for variable in variable_mappings.keys(): + assert not np.isnan(ed["Platform"][variable].values).all() + + # Number of dimensions in Platform group should be as previous + assert len(ed["Platform"].dims) == len(platform_preexisting_dims) + + # Dimension assignment + assert ed["Platform"]["longitude"].dims[0] == ed["Platform"]["latitude"].dims[0] + assert ed["Platform"]["longitude"].dims[0] in platform_preexisting_dims + assert ed["Platform"]["latitude"].dims[0] in platform_preexisting_dims + assert ed['Platform']['longitude'].coords['time1'].values[0] == ed['Sonar/Beam_group1'].ping_time.data[0] diff --git a/echopype/tests/echodata/utils.py b/echopype/tests/echodata/utils.py index 011dafeeb..31bcf93ce 100644 --- a/echopype/tests/echodata/utils.py +++ b/echopype/tests/echodata/utils.py @@ -10,6 +10,10 @@ from echopype.convert.set_groups_base import SetGroupsBase from echopype.echodata.echodata import EchoData +from echopype.echodata.convention import sonarnetcdf_1 + +class P2Z: + _varattrs = sonarnetcdf_1.yaml_dict["variable_and_varattributes"] class SetGroupsTest(SetGroupsBase): @@ -95,6 +99,7 @@ def get_mock_echodata( output_path=None, sonar_model=sonar_model, params={"survey_name": "mock_survey"}, + parsed2zarr_obj=P2Z(), ) tree_dict["/"] = setgrouper.set_toplevel( sonar_model, date_created=np.datetime64("1970-01-01") diff --git a/echopype/tests/utils/test_processinglevels_integration.py b/echopype/tests/utils/test_processinglevels_integration.py index 0dadbfc87..10c81c9eb 100644 --- a/echopype/tests/utils/test_processinglevels_integration.py +++ b/echopype/tests/utils/test_processinglevels_integration.py @@ -127,8 +127,6 @@ def _freqdiff_applymask(test_ds): # ---- Compute MVBS # compute_MVBS expects a variable named "Sv" - # No product level is assigned because at present compute_MVBS drops the lat/lon data - # associated with the input Sv dataset # ds = ds.rename_vars(name_dict={"Sv": "Sv_unmasked", "Sv_ch0": "Sv"}) - mvbs_ds = ep.commongrid.compute_MVBS(ds, range_meter_bin=30, ping_time_bin='1min') - _absence_test(mvbs_ds) + mvbs_ds = ep.commongrid.compute_MVBS(ds, range_bin="30m", ping_time_bin='1min') + _presence_test(mvbs_ds, "Level 
3B") diff --git a/echopype/tests/utils/test_utils_misc.py b/echopype/tests/utils/test_utils_misc.py new file mode 100644 index 000000000..5765acac9 --- /dev/null +++ b/echopype/tests/utils/test_utils_misc.py @@ -0,0 +1,31 @@ +import pytest + +import numpy as np +from echopype.utils.misc import depth_from_pressure + + +def test_depth_from_pressure(): + # A single pressure value and defaults for the other arguments + pressure = 100.0 + depth = depth_from_pressure(pressure) + assert np.isclose(depth, 99.2954) + + # Array of pressure and list of latitude values + pressure = np.array([100.0, 101.0, 101.0]) + latitude = [0.0, 30.0, 50.0] + depth = depth_from_pressure(pressure, latitude) + assert np.allclose(depth, [99.4265, 100.2881, 100.1096]) + + # Scalars specified for all 3 arguments + pressure = 1000.0 + latitude = 0.0 + atm_pres_surf = 10.1325 # standard atm pressure at sea level + depth = depth_from_pressure(pressure, latitude, atm_pres_surf) + assert np.isclose(depth, 982.0882) + + # ValueError triggered by argument arrays having different lengths + pressure = np.array([100.0, 101.0, 101.0]) + latitude = [0.0, 30.0] + with pytest.raises(ValueError) as excinfo: + depth = depth_from_pressure(pressure, latitude) + assert str(excinfo.value) == "Sequence shape or size does not match pressure" diff --git a/echopype/utils/coding.py b/echopype/utils/coding.py index 626aeb6e5..8679296e3 100644 --- a/echopype/utils/coding.py +++ b/echopype/utils/coding.py @@ -208,6 +208,9 @@ def set_zarr_encodings( chunks = _get_auto_chunk(val, chunk_size=chunk_size) encoding[name]["chunks"] = chunks + if PREFERRED_CHUNKS in encoding[name]: + # Remove 'preferred_chunks', use chunks only instead + encoding[name].pop(PREFERRED_CHUNKS) return encoding diff --git a/echopype/utils/compute.py b/echopype/utils/compute.py new file mode 100644 index 000000000..936a1187d --- /dev/null +++ b/echopype/utils/compute.py @@ -0,0 +1,41 @@ +"""compute.py + +Module containing various helper functions +for performing computations within echopype. 
+""" +from typing import Union + +import dask.array +import numpy as np + + +def _log2lin(data: Union[dask.array.Array, np.ndarray]) -> Union[dask.array.Array, np.ndarray]: + """Perform log to linear transform on data + + Parameters + ---------- + data : dask.array.Array or np.ndarray + The data to be transformed + + Returns + ------- + dask.array.Array or np.ndarray + The transformed data + """ + return 10 ** (data / 10) + + +def _lin2log(data: Union[dask.array.Array, np.ndarray]) -> Union[dask.array.Array, np.ndarray]: + """Perform linear to log transform on data + + Parameters + ---------- + data : dask.array.Array or np.ndarray + The data to be transformed + + Returns + ------- + dask.array.Array or np.ndarray + The transformed data + """ + return 10 * np.log10(data) diff --git a/echopype/utils/io.py b/echopype/utils/io.py index 3b93ee91e..236d3a233 100644 --- a/echopype/utils/io.py +++ b/echopype/utils/io.py @@ -11,6 +11,7 @@ import fsspec import xarray as xr +from dask.array import Array as DaskArray from fsspec import FSMap from fsspec.implementations.local import LocalFileSystem @@ -61,7 +62,8 @@ def save_file(ds, path, mode, engine, group=None, compression_settings=None, **k elif engine == "zarr": # Ensure that encoding and chunks match for var, enc in encoding.items(): - ds[var] = ds[var].chunk(enc.get("chunks", {})) + if isinstance(ds[var].data, DaskArray): + ds[var] = ds[var].chunk(enc.get("chunks", {})) ds.to_zarr(store=path, mode=mode, group=group, encoding=encoding, **kwargs) else: raise ValueError(f"{engine} is not a supported save format") diff --git a/echopype/utils/misc.py b/echopype/utils/misc.py new file mode 100644 index 000000000..c4aab43c8 --- /dev/null +++ b/echopype/utils/misc.py @@ -0,0 +1,87 @@ +from typing import List, Optional, Tuple, Union + +import numpy as np +from numpy.typing import NDArray + +FloatSequence = Union[List[float], Tuple[float], NDArray[float]] + + +def camelcase2snakecase(camel_case_str): + """ + Convert string from CamelCase to snake_case + e.g. CamelCase becomes camel_case. + """ + idx = list(reversed([i for i, c in enumerate(camel_case_str) if c.isupper()])) + param_len = len(camel_case_str) + for i in idx: + # check if we should insert an underscore + if i > 0 and i < param_len: + camel_case_str = camel_case_str[:i] + "_" + camel_case_str[i:] + + return camel_case_str.lower() + + +def depth_from_pressure( + pressure: Union[float, FloatSequence], + latitude: Optional[Union[float, FloatSequence]] = 30.0, + atm_pres_surf: Optional[Union[float, FloatSequence]] = 0.0, +) -> NDArray[float]: + """ + Convert pressure to depth using UNESCO 1983 algorithm. + + UNESCO. 1983. Algorithms for computation of fundamental properties of seawater (Pressure to + Depth conversion, pages 25-27). Prepared by Fofonoff, N.P. and Millard, R.C. UNESCO technical + papers in marine science, 44. http://unesdoc.unesco.org/images/0005/000598/059832eb.pdf + + Parameters + ---------- + pressure : Union[float, FloatSequence] + Pressure in dbar + latitude : Union[float, FloatSequence], default=30.0 + Latitude in decimal degrees. + atm_pres_surf : Union[float, FloatSequence], default=0.0 + Atmospheric pressure at the surface in dbar. + Use the default 0.0 value if pressure is corrected to be 0 at the surface. 
+ Otherwise, enter a correction for pressure due to air, sea ice and any other + medium that may be present + + Returns + ------- + depth : NDArray[float] + Depth in meters + """ + + def _as_nparray_check(v, check_vs_pressure=False): + """ + Convert to np.array if not already a np.array. + Ensure latitude and atm_pres_surf are of the same size and shape as + pressure if they are not scalar. + """ + v_array = np.array(v) if not isinstance(v, np.ndarray) else v + if check_vs_pressure: + if v_array.size != 1: + if v_array.size != pressure.size or v_array.shape != pressure.shape: + raise ValueError("Sequence shape or size does not match pressure") + return v_array + + pressure = _as_nparray_check(pressure) + latitude = _as_nparray_check(latitude, check_vs_pressure=True) + atm_pres_surf = _as_nparray_check(atm_pres_surf, check_vs_pressure=True) + + # Constants + g = 9.780318 + c1 = 9.72659 + c2 = -2.2512e-5 + c3 = 2.279e-10 + c4 = -1.82e-15 + k1 = 5.2788e-3 + k2 = 2.36e-5 + k3 = 1.092e-6 + + # Calculate depth + pressure = pressure - atm_pres_surf + depth_w_g = c1 * pressure + c2 * pressure**2 + c3 * pressure**3 + c4 * pressure**4 + x = np.sin(np.deg2rad(latitude)) + gravity = g * (1.0 + k1 * x**2 + k2 * x**4) + k3 * pressure + depth = depth_w_g / gravity + return depth diff --git a/pytest.ini b/pytest.ini index 7a3a8bfd2..9ad97f3a7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,7 @@ # test directory [pytest] testpaths = echopype/tests - cache_dir = .cache +markers = + unit: marks tests as unit tests + integration: marks tests as integration tests diff --git a/requirements-dev.txt b/requirements-dev.txt index fe3bd8841..80fbb3f60 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,6 +9,7 @@ flake8-mutable flake8-print isort mypy +moto numpydoc pre-commit pylint diff --git a/requirements.txt b/requirements.txt index eb5ca32c8..09ba1a7c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,5 +14,5 @@ requests aiohttp xarray-datatree==0.0.6 psutil>=5.9.1 -more-itertools==8.13.0 geopy +flox>=0.7.2,<1.0.0
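Note on the new echopype/utils/misc.py helper above: depth_from_pressure implements the UNESCO (1983) pressure-to-depth relationship. Restated compactly (this is only a transcription of the code and its constants, not an independent derivation):

\[ z \;=\; \frac{c_1 p + c_2 p^{2} + c_3 p^{3} + c_4 p^{4}}{g\left(1 + k_1 \sin^{2}\phi + k_2 \sin^{4}\phi\right) + k_3\, p} \]

where p is pressure in dbar after subtracting atm_pres_surf, \phi is the latitude, and g, c_1..c_4, k_1..k_3 are the constants defined in the module (g = 9.780318, c_1 = 9.72659, and so on).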
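Taken together, the test updates in this patch exercise two public API changes: open_raw now takes destination_path (e.g. "swap" or "no_swap") in place of the boolean use_swap flag, and compute_MVBS now takes a string range_bin (e.g. "30m") in place of range_meter_bin. A minimal sketch of the updated calls, based only on the signatures used in the tests above; the raw file path is a placeholder, not a file from the test data, and an EK60 file is assumed so that compute_Sv needs no extra keyword arguments:

import echopype as ep

# Convert a raw file, offloading large variables to a temporary zarr store
# ("swap"); use "no_swap" to keep everything in memory.
ed = ep.open_raw(
    raw_file="my_survey-D20170615-T190214.raw",  # placeholder path
    sonar_model="EK60",
    destination_path="swap",
    max_mb=100,
)

# Calibrate, then grid onto a common (ping_time, echo_range) grid.
ds_Sv = ep.calibrate.compute_Sv(ed)
ds_MVBS = ep.commongrid.compute_MVBS(
    ds_Sv,
    range_bin="30m",          # was range_meter_bin=30
    ping_time_bin="1min",
)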