Migrate dfencoder to morpheus repo (#763)

* Copy dfencoder code, ensuring correct copyright headers
* Add a `manual_seed` helper method, replacing duplicated code
* Add tests for dfencoder
* Insert both our Apache license, and the original author's BSD license in each of the 4 python files (and the `__init__.py`). The original code didn't have that, but this conforms with our other 3rd party code inclusions.
* Transfer compiled C++ tests and libs (fixes #765)

fixes #753

Authors:
  - David Gardner (https://github.com/dagardner-nv)
  - Michael Demoret (https://github.com/mdemoret-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: #763
nv-morpheus · Mar 23, 2023 · 33e922c · 33e922c
1 parent ca66ec0
commit 33e922c
Show file tree

Hide file tree

Showing 44 changed files with 2,643 additions and 224 deletions.
diff --git a/ci/scripts/github/build.sh b/ci/scripts/github/build.sh
@@ -30,6 +30,7 @@ sccache --version
 
 rapids-logger "Configuring cmake for Morpheus"
 git submodule update --init --recursive
+
 cmake -B build -G Ninja ${CMAKE_BUILD_ALL_FEATURES} \
     -DCCACHE_PROGRAM_PATH=$(which sccache) \
     -DMORPHEUS_PYTHON_BUILD_WHEEL=ON \
@@ -45,8 +46,16 @@ sccache --show-stats
 rapids-logger "Archiving results"
 tar cfj "${WORKSPACE_TMP}/wheel.tar.bz" build/dist
 
+MORPHEUS_LIBS=($(find ${MORPHEUS_ROOT}/build/morpheus/_lib -name "*.so" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;))
+tar cfj "${WORKSPACE_TMP}/morhpeus_libs.tar.bz" "${MORPHEUS_LIBS[@]}"
+
+CPP_TESTS=($(find ${MORPHEUS_ROOT}/build/morpheus/_lib/tests -name "*.x" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;))
+tar cfj "${WORKSPACE_TMP}/cpp_tests.tar.bz" "${CPP_TESTS[@]}"
+
 rapids-logger "Pushing results to ${DISPLAY_ARTIFACT_URL}"
 aws s3 cp --no-progress "${WORKSPACE_TMP}/wheel.tar.bz" "${ARTIFACT_URL}/wheel.tar.bz"
+aws s3 cp --no-progress "${WORKSPACE_TMP}/morhpeus_libs.tar.bz" "${ARTIFACT_URL}/morhpeus_libs.tar.bz"
+aws s3 cp --no-progress "${WORKSPACE_TMP}/cpp_tests.tar.bz" "${ARTIFACT_URL}/cpp_tests.tar.bz"
 
 rapids-logger "Success"
 exit 0
diff --git a/ci/scripts/github/test.sh b/ci/scripts/github/test.sh
@@ -22,16 +22,20 @@ source ${WORKSPACE}/ci/scripts/github/common.sh
 update_conda_env
 
 aws s3 cp --no-progress "${ARTIFACT_URL}/wheel.tar.bz" "${WORKSPACE_TMP}/wheel.tar.bz"
+aws s3 cp --no-progress "${ARTIFACT_URL}/cpp_tests.tar.bz" "${WORKSPACE_TMP}/cpp_tests.tar.bz"
+aws s3 cp --no-progress "${ARTIFACT_URL}/morhpeus_libs.tar.bz" "${WORKSPACE_TMP}/morhpeus_libs.tar.bz"
 
 tar xf "${WORKSPACE_TMP}/wheel.tar.bz"
+tar xf "${WORKSPACE_TMP}/morhpeus_libs.tar.bz"
+tar xf "${WORKSPACE_TMP}/cpp_tests.tar.bz"
 
 # Install the built Morpheus python package
+cd ${MORPHEUS_ROOT}
 pip install ${MORPHEUS_ROOT}/build/dist/*.whl
 
 CPP_TESTS=($(find ${MORPHEUS_ROOT}/build -name "*.x"))
 
 rapids-logger "Pulling LFS assets"
-cd ${MORPHEUS_ROOT}
 
 git lfs install
 ${MORPHEUS_ROOT}/scripts/fetch_data.py fetch tests validation
@@ -43,6 +47,12 @@ git lfs ls-files
 REPORTS_DIR="${WORKSPACE_TMP}/reports"
 mkdir -p ${WORKSPACE_TMP}/reports
 
+rapids-logger "Running C++ tests"
+# Running the tests from the tests dir. Normally this isn't necessary, however since
+# we are testing the installed version of morpheus in site-packages and not the one
+# in the repo dir, the pytest coverage module reports incorrect coverage stats.
+pushd ${MORPHEUS_ROOT}/tests
+
 TEST_RESULTS=0
 for cpp_test in "${CPP_TESTS[@]}"; do
        test_name=$(basename ${cpp_test})
@@ -57,11 +67,6 @@ for cpp_test in "${CPP_TESTS[@]}"; do
 done
 
 rapids-logger "Running Python tests"
-# Running the tests from the tests dir. Normally this isn't nescesary, however since
-# we are testing the installed version of morpheus in site-packages and not the one
-# in the repo dir, the pytest coverage module reports incorrect coverage stats.
-cd ${MORPHEUS_ROOT}/tests
-
 set +e
 
 python -I -m pytest --run_slow --run_kafka \
@@ -74,6 +79,7 @@ PYTEST_RESULTS=$?
 TEST_RESULTS=$(($TEST_RESULTS+$PYTEST_RESULTS))
 
 set -e
+popd
 
 rapids-logger "Archiving test reports"
 cd $(dirname ${REPORTS_DIR})

diff --git a/docker/conda/environments/cuda11.8_dev.yml b/docker/conda/environments/cuda11.8_dev.yml
@@ -38,6 +38,7 @@ dependencies:
     - cupy=9.5.0
     - cython=0.29.24
     - datacompy=0.8
+    - dill
     - docker-compose=1.29.2
     - docker-py=5.0
     - faker=12.3.0
@@ -81,6 +82,7 @@ dependencies:
     - python=3.8
     - rapidjson=1.1.0
     - scikit-build=0.13
+    - scikit-learn=0.23.1
     - sphinx
     - sphinx_rtd_theme
     - sysroot_linux-64=2.17

diff --git a/docker/conda/environments/requirements.txt b/docker/conda/environments/requirements.txt
@@ -4,7 +4,6 @@
 
 ####### Pip-only runtime dependencies (keep sorted!) #######
 # Packages listed here should also be listed in setup.py
-git+/~https://github.com/nv-morpheus/dfencoder.git@branch-23.01#egg=dfencoder
 ipywidgets
 jupyter-core>=4.11.2,<5.0
 jupyterlab

diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_training.py b/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_training.py
@@ -15,10 +15,10 @@
 import logging
 
 import mrc
-from dfencoder import AutoEncoder
 from mrc.core import operators as ops
 
 from morpheus.messages.multi_ae_message import MultiAEMessage
+from morpheus.models.dfencoder import AutoEncoder
 from morpheus.utils.module_ids import MODULE_NAMESPACE
 from morpheus.utils.module_utils import get_module_config
 from morpheus.utils.module_utils import register_module

diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py
@@ -21,7 +21,6 @@
 import mlflow
 import mrc
 import requests
-from dfencoder import AutoEncoder
 from mlflow.exceptions import MlflowException
 from mlflow.models.signature import ModelSignature
 from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS
@@ -36,6 +35,7 @@
 
 from morpheus.config import Config
 from morpheus.messages.multi_ae_message import MultiAEMessage
+from morpheus.models.dfencoder import AutoEncoder
 from morpheus.pipeline.single_port_stage import SinglePortStage
 from morpheus.pipeline.stream_pair import StreamPair
 

diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py
@@ -16,12 +16,12 @@
 import typing
 
 import mrc
-from dfencoder import AutoEncoder
 from mrc.core import operators as ops
 from sklearn.model_selection import train_test_split
 
 from morpheus.config import Config
 from morpheus.messages.multi_ae_message import MultiAEMessage
+from morpheus.models.dfencoder import AutoEncoder
 from morpheus.pipeline.single_port_stage import SinglePortStage
 from morpheus.pipeline.stream_pair import StreamPair
 

diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py
@@ -20,12 +20,13 @@
 from datetime import datetime
 
 import mlflow
-from dfencoder import AutoEncoder
 from mlflow.entities.model_registry import RegisteredModel
 from mlflow.exceptions import MlflowException
 from mlflow.store.entities.paged_list import PagedList
 from mlflow.tracking.client import MlflowClient
 
+from morpheus.models.dfencoder import AutoEncoder
+
 from .logging_timer import log_time
 
 logger = logging.getLogger("morpheus.{}".format(__name__))

diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/user_model_manager.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/user_model_manager.py
@@ -15,13 +15,12 @@
 import logging
 import typing
 
-import numpy as np
 import pandas as pd
-import torch
-from dfencoder import AutoEncoder
 from tqdm import tqdm
 
 from morpheus.config import Config
+from morpheus.models.dfencoder import AutoEncoder
+from morpheus.utils.seed import manual_seed
 
 logger = logging.getLogger("morpheus.{}".format(__name__))
 
@@ -132,10 +131,7 @@ def train_from_batch(self, filter_func=lambda df: df):
 
         # If the seed is set, enforce that here
         if (self._seed is not None):
-            torch.manual_seed(self._seed)
-            torch.cuda.manual_seed(self._seed)
-            np.random.seed(self._seed)
-            torch.backends.cudnn.deterministic = True
+            manual_seed(self._seed)
 
         model = self._model_class(
             encoder_layers=[512, 500],  # layers of the encoding part
@@ -215,10 +211,7 @@ def train(self, df: pd.DataFrame) -> AutoEncoder:
 
         # If the seed is set, enforce that here
         if (self._seed is not None):
-            torch.manual_seed(self._seed)
-            torch.cuda.manual_seed(self._seed)
-            np.random.seed(self._seed)
-            torch.backends.cudnn.deterministic = True
+            manual_seed(self._seed)
 
         model = self._model_class(
             encoder_layers=[512, 500],  # layers of the encoding part

diff --git a/models/training-tuning-scripts/dfp-models/hammah-20211017-script.py b/models/training-tuning-scripts/dfp-models/hammah-20211017-script.py
@@ -22,10 +22,10 @@
 import argparse
 
 import dill
-import numpy as np
 import pandas as pd
 import torch
-from dfencoder import AutoEncoder
+from morpheus.models.dfencoder import AutoEncoder
+from morpheus.utils.seed import manual_seed
 
 
 def main():
@@ -84,10 +84,7 @@ def main():
     for i in list(X_train):
         if i not in list(X_val):
             X_train = X_train.drop([i], axis=1)
-    torch.manual_seed(42)
-    torch.cuda.manual_seed(42)
-    np.random.seed(42)
-    torch.backends.cudnn.deterministic = True
+    manual_seed(42)
     model = AutoEncoder(
         encoder_layers=[512, 500],  # layers of the encoding part
         decoder_layers=[512],  # layers of the decoding part

diff --git a/models/training-tuning-scripts/dfp-models/hammah-20211017.ipynb b/models/training-tuning-scripts/dfp-models/hammah-20211017.ipynb
diff --git a/models/training-tuning-scripts/phishing-models/phish-bert-training.ipynb b/models/training-tuning-scripts/phishing-models/phish-bert-training.ipynb
@@ -62,7 +62,9 @@
     "import requests\n",
     "import os.path\n",
     "import torch\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "\n",
+    "from morpheus.utils.seed import manual_seed"
    ]
   },
   {
@@ -227,10 +229,7 @@
    "outputs": [],
    "source": [
     "# set seeds for model reproducability\n",
-    "torch.manual_seed(random_seed)\n",
-    "torch.cuda.manual_seed(random_seed)\n",
-    "np.random.seed(random_seed)\n",
-    "torch.backends.cudnn.deterministic = True"
+    "manual_seed(random_seed)"
    ]
   },
   {

diff --git a/models/training-tuning-scripts/root-cause-models/root-cause-bert.ipynb b/models/training-tuning-scripts/root-cause-models/root-cause-bert.ipynb
@@ -73,7 +73,9 @@
     "from sklearn.model_selection import train_test_split\n",
     "import pandas as pd\n",
     "import numpy as np\n",
-    "import time"
+    "import time\n",
+    "\n",
+    "from morpheus.utils.seed import manual_seed"
    ]
   },
   {
@@ -324,10 +326,7 @@
     }
    ],
    "source": [
-    "torch.manual_seed(random_seed)\n",
-    "torch.cuda.manual_seed(random_seed)\n",
-    "np.random.seed(random_seed)\n",
-    "torch.backends.cudnn.deterministic = True\n",
+    "manual_seed(random_seed)\n",
     "seq_classifier.train_model(X_train[\"log\"], y_train,batch_size=128, epochs=1,learning_rate=3.6e-4)"
    ]
   },

diff --git a/models/training-tuning-scripts/root-cause-models/root-cause-bert.py b/models/training-tuning-scripts/root-cause-models/root-cause-bert.py
@@ -22,14 +22,13 @@
 import argparse
 import time
 
-import numpy as np
+import cudf
 import pandas as pd
-import torch
 from binary_sequence_classifier import BinarySequenceClassifier
 from sklearn.metrics import f1_score
 from sklearn.model_selection import train_test_split
 
-import cudf
+from morpheus.utils.seed import manual_seed
 
 
 def train(trainingdata, unseenerrors):
@@ -67,10 +66,7 @@ def train(trainingdata, unseenerrors):
 
     seq_classifier.init_model('bert-base-uncased')
 
-    torch.manual_seed(random_seed)
-    torch.cuda.manual_seed(random_seed)
-    np.random.seed(random_seed)
-    torch.backends.cudnn.deterministic = True
+    manual_seed(random_seed)
     seq_classifier.train_model(X_train['log'], y_train, batch_size=128, epochs=1, learning_rate=3.6e-04)
 
     timestr = time.strftime('%Y%m%d-%H%M%S')

diff --git a/morpheus/_lib/cmake/libraries/morpheus.cmake b/morpheus/_lib/cmake/libraries/morpheus.cmake
@@ -118,6 +118,12 @@ target_include_directories(morpheus
     $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
 )
 
+# We want to use RUNPATH instead of RPATH to allow LD_LIBRARY_PATH to take precedence over the paths specified in the
+# binary. This is necessary to allow ld to find the real libcuda.so instead of the stub. Eventually, this can be removed
+# once upgraded to cuda-python 12.1. Ideally, cuda-python would just load libcuda.so.1 which would take precedence over
+# libcuda.so. Relevant issue: https://github.com/NVIDIA/cuda-python/issues/17
+target_link_options(morpheus PUBLIC "-Wl,--enable-new-dtags")
+
 set_target_properties(morpheus
   PROPERTIES
     CXX_VISIBILITY_PRESET hidden

diff --git a/morpheus/messages/multi_ae_message.py b/morpheus/messages/multi_ae_message.py
@@ -15,10 +15,9 @@
 import dataclasses
 import logging
 
-from dfencoder import AutoEncoder
-
 from morpheus.messages.message_meta import MessageMeta
 from morpheus.messages.multi_message import MultiMessage
+from morpheus.models.dfencoder import AutoEncoder
 
 logger = logging.getLogger(__name__)
 

diff --git a/morpheus/messages/multi_inference_ae_message.py b/morpheus/messages/multi_inference_ae_message.py
@@ -15,12 +15,11 @@
 import dataclasses
 import typing
 
-from dfencoder.autoencoder import AutoEncoder
-
 from morpheus.messages.memory.tensor_memory import TensorMemory
 from morpheus.messages.message_meta import MessageMeta
 from morpheus.messages.message_meta import UserMessageMeta
 from morpheus.messages.multi_inference_message import MultiInferenceMessage
+from morpheus.models.dfencoder.autoencoder import AutoEncoder
 
 
 @dataclasses.dataclass

diff --git a/morpheus/models/__init__.py b/morpheus/models/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.