Hybrid Parallel AD (Part 3/?) #1294

Merged Aug 7, 2021, with 47 commits.

Commits (changes from all commits):
bb27a0b  CoDiPack update. (jblueh, Jun 1, 2021)
183c3ca  CoDiPack tape choice via build options. (jblueh, Jun 11, 2021)
f501dc1  Fix for the disc_adj_fsi problem. (jblueh, Jun 11, 2021)
3b0ebd3  Merge branch 'develop' into hybrid_parallel_ad4 (pcarruscag, Jun 24, 2021)
6f3c86a  work estimate for OpenMP scheduling of preconditioners based on num n… (pcarruscag, Jun 24, 2021)
d5f8ac9  small fix (pcarruscag, Jun 24, 2021)
53bd274  update tests (pcarruscag, Jun 24, 2021)
0fe1e67  add hybrid AD regressions (pcarruscag, Jun 24, 2021)
5017a90  set reference residuals (pcarruscag, Jun 25, 2021)
b7b3dd7  remove heat case because heat solver does not have openmp (pcarruscag, Jun 25, 2021)
967704c  Make preaccumulation threadprivate. (jblueh, Jun 28, 2021)
0a72b67  Re-enable parallel preaccumulation. (jblueh, Jun 28, 2021)
c38bf14  Remove unused variable. (jblueh, Jun 28, 2021)
b61684b  PreaccActive was never reset. (jblueh, Jun 28, 2021)
781092a  Identify some faulty preaccumulation regions. (jblueh, Jun 29, 2021)
77aa7d0  Disable preaccumulation for parallel boundary numerics. (jblueh, Jun 30, 2021)
742118d  Add assert. (jblueh, Jun 30, 2021)
9b09003  Merge remote-tracking branch 'upstream/develop' into hybrid_parallel_ad4 (pcarruscag, Jul 5, 2021)
1d2c206  disable preacc when coloring fails (pcarruscag, Jul 5, 2021)
a573f9a  small fix (pcarruscag, Jul 5, 2021)
bc90f74  Add shared reading switches. (jblueh, Jul 6, 2021)
cba486d  Apply some shared reading optimizations. (jblueh, Jul 6, 2021)
8e7a9c6  Apply suggestions. (jblueh, Jul 6, 2021)
e03f11b  Update Arina2K regression (pcarruscag, Jul 6, 2021)
15d3666  Remove redundant init. (jblueh, Jul 7, 2021)
e10abcc  OpDiLib update. (jblueh, Jul 13, 2021)
2726ca6  Add build option for shared reading optimization. (jblueh, Jul 13, 2021)
f8fe252  Fix. (jblueh, Jul 13, 2021)
ab91794  Merge branch 'develop' into hybrid_parallel_ad4 (pcarruscag, Jul 15, 2021)
d8656aa  update discadj_fea (hybrid AD) (pcarruscag, Jul 15, 2021)
ac18c09  Merge branch 'develop' into hybrid_parallel_ad4 (pcarruscag, Jul 20, 2021)
3c84ad1  Missing barrier. (jblueh, Jul 20, 2021)
c8ff857  CoDiPack update. (jblueh, Jul 20, 2021)
5901d8b  Merge remote-tracking branch 'su2github/hybrid_parallel_ad4' into hyb… (jblueh, Jul 20, 2021)
fcc39ce  Move barrier inside HandleTemporariesOut. (jblueh, Jul 22, 2021)
028d1e0  Further shared reading optimizations. (jblueh, Jul 28, 2021)
7acc44f  Test without boundary treatment. (jblueh, Jul 28, 2021)
7586e7c  Merge branch 'develop' into hybrid_parallel_ad4 (jblueh, Jul 28, 2021)
2830dea  Source_Residual shared reading optimizations. (jblueh, Jul 28, 2021)
25ba4e3  revise some shared readings and add others (pcarruscag, Aug 1, 2021)
3b4a018  Merge branch 'develop' into hybrid_parallel_ad4 (pcarruscag, Aug 1, 2021)
a9466bb  Suggestion for an option that disabled OpDiLib. (jblueh, Aug 2, 2021)
1ce5115  OpDiLib update. (jblueh, Aug 4, 2021)
e81a8ff  Revert "Suggestion for an option that disabled OpDiLib." (jblueh, Aug 4, 2021)
3f81059  Replace assert by warning. (jblueh, Aug 4, 2021)
d64d620  Update SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp (pcarruscag, Aug 5, 2021)
597c637  Merge branch 'develop' into hybrid_parallel_ad4 (pcarruscag, Aug 7, 2021)
4 changes: 3 additions & 1 deletion .github/workflows/regression.yml
@@ -60,7 +60,7 @@ jobs:
strategy:
fail-fast: false
matrix:
testscript: ['tutorials.py', 'parallel_regression.py', 'parallel_regression_AD.py', 'serial_regression.py', 'serial_regression_AD.py', 'hybrid_regression.py']
testscript: ['tutorials.py', 'parallel_regression.py', 'parallel_regression_AD.py', 'serial_regression.py', 'serial_regression_AD.py', 'hybrid_regression.py', 'hybrid_regression_AD.py']
include:
- testscript: 'tutorials.py'
tag: MPI
@@ -74,6 +74,8 @@
tag: NoMPI
- testscript: 'hybrid_regression.py'
tag: OMP
- testscript: 'hybrid_regression_AD.py'
tag: OMP
steps:
- name: Download All artifact
uses: actions/download-artifact@v2
66 changes: 63 additions & 3 deletions Common/include/basic_types/ad_structure.hpp
@@ -252,7 +252,7 @@ namespace AD{

/*!
* \brief Start a passive region, i.e. stop recording.
* \return True is tape was active.
* \return True if tape was active.
*/
inline bool BeginPassive() { return false; }

@@ -262,6 +262,28 @@
*/
inline void EndPassive(bool wasActive) {}

/*!
* \brief Pause the use of preaccumulation.
* \return True if preaccumulation was active.
*/
inline bool PausePreaccumulation() { return false; }

/*!
* \brief Resume the use of preaccumulation.
* \param[in] wasActive - Whether preaccumulation was active before pausing.
*/
inline void ResumePreaccumulation(bool wasActive) {}

/*!
* \brief Begin a hybrid parallel adjoint evaluation mode that assumes an inherently safe reverse path.
*/
inline void StartNoSharedReading() {}

/*!
* \brief End the "no shared reading" adjoint evaluation mode.
*/
inline void EndNoSharedReading() {}

#else
using CheckpointHandler = codi::DataStore;

@@ -271,9 +293,10 @@

extern ExtFuncHelper* FuncHelper;

extern bool Status;

extern bool PreaccActive;
#ifdef HAVE_OPDI
SU2_OMP(threadprivate(PreaccActive))
#endif

extern bool PreaccEnabled;

@@ -290,6 +313,9 @@
extern std::vector<TapePosition> TapePositions;

extern codi::PreaccumulationHelper<su2double> PreaccHelper;
#ifdef HAVE_OPDI
SU2_OMP(threadprivate(PreaccHelper))
#endif

/*--- Reference to the tape. ---*/

@@ -446,6 +472,7 @@ namespace AD{
FORCEINLINE void EndPreacc(){
if (PreaccActive) {
PreaccHelper.finish(false);
PreaccActive = false;
}
}
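
A note on the fix above: resetting PreaccActive (commit b61684b) keeps the flag consistent with the usual call pattern, sketched below. SetPreaccOut is assumed as the output-side counterpart of the SetPreaccIn calls that appear later in this diff, and the variable names are illustrative:

  AD::StartPreacc();                 // open a preaccumulation region (no-op if recording is off)
  AD::SetPreaccIn(coord_i, nDim);    // register the region's inputs
  // ... compute outputs that depend only on the registered inputs ...
  AD::SetPreaccOut(residual, nVar);  // register the outputs (assumed name)
  AD::EndPreacc();                   // store the local Jacobian instead of the full statement tape

Without the reset, a later EndPreacc could try to finish a region that was never successfully started.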

@@ -522,6 +549,39 @@

FORCEINLINE void EndPassive(bool wasActive) { if(wasActive) StartRecording(); }

FORCEINLINE bool PausePreaccumulation() {
const auto current = PreaccEnabled;
if (!current) return false;
SU2_OMP_BARRIER
SU2_OMP_MASTER
PreaccEnabled = false;
END_SU2_OMP_MASTER
SU2_OMP_BARRIER
return true;
}

FORCEINLINE void ResumePreaccumulation(bool wasActive) {
if (!wasActive) return;
SU2_OMP_BARRIER
SU2_OMP_MASTER
PreaccEnabled = true;
END_SU2_OMP_MASTER
SU2_OMP_BARRIER
}
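
The barrier/master/barrier structure above is the standard OpenMP idiom for toggling shared state; stripped of the SU2 macros it reads (illustrative only, not part of the diff):

  #pragma omp barrier     // all threads stop touching preaccumulation first
  #pragma omp master
  PreaccEnabled = false;  // a single thread flips the shared flag
  #pragma omp barrier     // every thread observes the new value before continuing

The first barrier guarantees no thread is still inside a preaccumulation region when the flag changes; the second guarantees no thread races ahead with a stale value.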

FORCEINLINE void StartNoSharedReading() {
#ifdef HAVE_OPDI
opdi::logic->setAdjointAccessMode(opdi::LogicInterface::AdjointAccessMode::Classical);
opdi::logic->addReverseBarrier();
#endif
}

FORCEINLINE void EndNoSharedReading() {
#ifdef HAVE_OPDI
opdi::logic->setAdjointAccessMode(opdi::LogicInterface::AdjointAccessMode::Atomic);
opdi::logic->addReverseBarrier();
#endif
}
#endif // CODI_REVERSE_TYPE

} // namespace AD
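Taken together, the new switches are used as in CFVMFlowSolverBase::EdgeFluxResidual further down in this diff; condensed (ReducerStrategy is the solver's existing fallback flag):

  bool pausePreacc = false;
  if (ReducerStrategy) pausePreacc = AD::PausePreaccumulation();  // shared reads possible
  else AD::StartNoSharedReading();  // reverse path is race-free, use classical adjoint access

  // ... OpenMP-parallel recording loop ...

  AD::ResumePreaccumulation(pausePreacc);
  if (!ReducerStrategy) AD::EndNoSharedReading();

With OpDiLib, StartNoSharedReading switches the adjoint access mode from Atomic to Classical between reverse barriers, so sections whose recorded statements never read shared variables skip the cost of atomic adjoint updates.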
20 changes: 5 additions & 15 deletions Common/include/code_config.hpp
@@ -79,25 +79,15 @@ using su2conditional_t = typename su2conditional<B,T,F>::type;
#include "codi.hpp"
#include "codi/tools/dataStore.hpp"

#ifndef CODI_INDEX_TAPE
#define CODI_INDEX_TAPE 0
#endif
#ifndef CODI_PRIMAL_TAPE
#define CODI_PRIMAL_TAPE 0
#endif
#ifndef CODI_PRIMAL_INDEX_TAPE
#define CODI_PRIMAL_INDEX_TAPE 0
#endif

#if defined(HAVE_OMP)
using su2double = codi::RealReverseIndexParallel;
#else
#if CODI_INDEX_TAPE
#if defined(CODI_INDEX_TAPE)
using su2double = codi::RealReverseIndex;
#elif CODI_PRIMAL_TAPE
using su2double = codi::RealReversePrimal;
#elif CODI_PRIMAL_INDEX_TAPE
using su2double = codi::RealReversePrimalIndex;
//#elif defined(CODI_PRIMAL_TAPE)
//using su2double = codi::RealReversePrimal;
//#elif defined(CODI_PRIMAL_INDEX_TAPE)
//using su2double = codi::RealReversePrimalIndex;
#else
using su2double = codi::RealReverse;
#endif
2 changes: 2 additions & 0 deletions Common/include/linear_algebra/CSysSolve.hpp
@@ -256,6 +256,7 @@ class CSysSolve {
void HandleTemporariesOut(CSysVector<OtherType>& LinSysSol) {

/*--- Reset the pointers. ---*/
SU2_OMP_BARRIER
SU2_OMP_MASTER {
LinSysRes_ptr = nullptr;
LinSysSol_ptr = nullptr;
@@ -276,6 +277,7 @@
LinSysSol.PassiveCopy(LinSysSol_tmp);

/*--- Reset the pointers. ---*/
SU2_OMP_BARRIER
SU2_OMP_MASTER {
LinSysRes_ptr = nullptr;
LinSysSol_ptr = nullptr;
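The added SU2_OMP_BARRIER closes a race (commits 3c84d1 is "Missing barrier.", fcc39ce moved it here): SU2_OMP_MASTER has no implied barrier, so without it the master thread could null the shared pointers while other threads are still using them. Schematically (illustrative two-thread timeline):

  // without the barrier:
  //   thread 1: ... still reading through LinSysSol_ptr from the preceding work
  //   thread 0: LinSysSol_ptr = nullptr;  // master resets too early
  // with the barrier, all threads finish with the pointers before the reset.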
2 changes: 1 addition & 1 deletion Common/include/toolboxes/graph_toolbox.hpp
@@ -527,7 +527,7 @@ T createNaturalColoring(Index_t numInnerIndexes)
* \param[out] indexColor - Optional, vector with colors given to the outer indices.
* \return Coloring in the same type of the input pattern.
*/
template<class T, typename Color_t = char, size_t MaxColors = 32, size_t MaxMB = 128>
template<class T, typename Color_t = char, size_t MaxColors = 64, size_t MaxMB = 128>
T colorSparsePattern(const T& pattern, size_t groupSize = 1, bool balanceColors = false,
std::vector<Color_t>* indexColor = nullptr)
{
4 changes: 0 additions & 4 deletions Common/src/CConfig.cpp
@@ -4473,11 +4473,7 @@ void CConfig::SetPostprocessing(SU2_COMPONENT val_software, unsigned short val_i
#if defined CODI_REVERSE_TYPE
AD_Mode = YES;

#if defined HAVE_OMP
AD::PreaccEnabled = false;
#else
AD::PreaccEnabled = AD_Preaccumulation;
#endif

#else
if (AD_Mode == YES) {
7 changes: 7 additions & 0 deletions Common/src/basic_types/ad_structure.cpp
@@ -35,9 +35,16 @@ namespace AD {
std::vector<TapePosition> TapePositions;

bool PreaccActive = false;
#ifdef HAVE_OPDI
SU2_OMP(threadprivate(PreaccActive))
#endif

bool PreaccEnabled = true;

codi::PreaccumulationHelper<su2double> PreaccHelper;
#ifdef HAVE_OPDI
SU2_OMP(threadprivate(PreaccHelper))
#endif

ExtFuncHelper* FuncHelper;

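SU2_OMP(threadprivate(...)) expands to an OpenMP threadprivate directive, so each thread owns its copy of the preaccumulation state (commit 967704c) and per-thread regions cannot clobber one another; in plain OpenMP terms:

  static bool PreaccActive = false;
  #pragma omp threadprivate(PreaccActive)  // one private instance per thread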
3 changes: 2 additions & 1 deletion Common/src/geometry/CPhysicalGeometry.cpp
@@ -7701,7 +7701,8 @@ void CPhysicalGeometry::SetBoundControlVolume(const CConfig *config, unsigned sh

const auto nNodes = bound[iMarker][iElem]->GetnNodes();

AD::StartPreacc();
/*--- Cannot preaccumulate if hybrid parallel due to shared reading. ---*/
if (omp_get_num_threads() == 1) AD::StartPreacc();

/*--- Get pointers to the coordinates of all the element nodes ---*/
array<const su2double*, N_POINTS_MAXIMUM> Coord;
15 changes: 11 additions & 4 deletions Common/src/linear_algebra/CSysMatrix.cpp
@@ -185,10 +185,17 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
/*--- This is akin to the row_ptr. ---*/
omp_partitions = new unsigned long [omp_num_parts+1];

/// TODO: Use a work estimate to produce more balanced partitions.
auto pts_per_part = roundUpDiv(nPointDomain, omp_num_parts);
for(auto part = 0ul; part < omp_num_parts; ++part)
omp_partitions[part] = part * pts_per_part;
/*--- Work estimate based on non-zeros to produce balanced partitions. ---*/

const auto row_ptr_prec = ilu_needed? row_ptr_ilu : row_ptr;
const auto nnz_prec = row_ptr_prec[nPointDomain];

const auto nnz_per_part = roundUpDiv(nnz_prec, omp_num_parts);

for (auto iPoint = 0ul, part = 0ul; iPoint < nPointDomain; ++iPoint) {
if (row_ptr_prec[iPoint] >= part*nnz_per_part)
omp_partitions[part++] = iPoint;
}
omp_partitions[omp_num_parts] = nPointDomain;

/*--- Generate MKL Kernels ---*/
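A worked example of the new partitioning, with hypothetical numbers: nPointDomain = 4, row_ptr = {0, 12, 14, 16, 24} (row 0 alone holds 12 of the 24 nonzeros), omp_num_parts = 2, hence nnz_per_part = 12:

  // iPoint = 0: row_ptr[0] = 0  >= 0*12  ->  omp_partitions[0] = 0
  // iPoint = 1: row_ptr[1] = 12 >= 1*12  ->  omp_partitions[1] = 1
  // afterwards:                              omp_partitions[2] = 4
  // Partition 0 = {row 0} with 12 nonzeros, partition 1 = {rows 1,2,3} with 12.
  // The previous row-count split would give 14 vs 10 nonzeros for this pattern.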
3 changes: 2 additions & 1 deletion SU2_CFD/include/gradients/computeGradientsGreenGauss.hpp
@@ -76,7 +76,8 @@ void computeGradientsGreenGauss(CSolver* solver,
{
auto nodes = geometry.nodes;

AD::StartPreacc();
/*--- Cannot preaccumulate if hybrid parallel due to shared reading. ---*/
if (omp_get_num_threads() == 1) AD::StartPreacc();
AD::SetPreaccIn(nodes->GetVolume(iPoint));
AD::SetPreaccIn(nodes->GetPeriodicVolume(iPoint));

3 changes: 2 additions & 1 deletion SU2_CFD/include/gradients/computeGradientsLeastSquares.hpp
@@ -203,7 +203,8 @@
auto nodes = geometry.nodes;
const auto coord_i = nodes->GetCoord(iPoint);

AD::StartPreacc();
/*--- Cannot preaccumulate if hybrid parallel due to shared reading. ---*/
if (omp_get_num_threads() == 1) AD::StartPreacc();
AD::SetPreaccIn(coord_i, nDim);

for (size_t iVar = varBegin; iVar < varEnd; ++iVar)
3 changes: 2 additions & 1 deletion SU2_CFD/include/limiters/computeLimiters_impl.hpp
@@ -132,7 +132,8 @@ void computeLimiters_impl(CSolver* solver,
auto nodes = geometry.nodes;
const auto coord_i = nodes->GetCoord(iPoint);

AD::StartPreacc();
/*--- Cannot preaccumulate if hybrid parallel due to shared reading. ---*/
if (omp_get_num_threads() == 1) AD::StartPreacc();
AD::SetPreaccIn(coord_i, nDim);

for (size_t iVar = varBegin; iVar < varEnd; ++iVar)
24 changes: 23 additions & 1 deletion SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
@@ -319,7 +319,11 @@ void CFVMFlowSolverBase<V, R>::HybridParallelInitialization(const CConfig& confi
cout << "WARNING: On " << numRanksUsingReducer << " MPI ranks the coloring efficiency was less than "
<< COLORING_EFF_THRESH << " (min value was " << minEff << ").\n"
<< " Those ranks will now use a fallback strategy, better performance may be possible\n"
<< " with a different value of config option EDGE_COLORING_GROUP_SIZE (default 512)." << endl;
<< " with a different value of config option EDGE_COLORING_GROUP_SIZE (default 512)."
#ifdef HAVE_OPDI
<< "\n The memory usage of the discrete adjoint solver is higher when using the fallback."
#endif
<< endl;
}

if (config.GetUseVectorization() && (omp_get_max_threads() > 1) &&
@@ -1531,6 +1535,12 @@ void CFVMFlowSolverBase<V, R>::EdgeFluxResidual(const CGeometry *geometry,
InstantiateEdgeNumerics(solvers, config);
}

/*--- For hybrid parallel AD, pause preaccumulation if there is shared reading of
* variables, otherwise switch to the faster adjoint evaluation mode. ---*/
bool pausePreacc = false;
if (ReducerStrategy) pausePreacc = AD::PausePreaccumulation();
else AD::StartNoSharedReading();

/*--- Loop over edge colors. ---*/
for (auto color : EdgeColoring) {
/*--- Chunk size is at least OMP_MIN_SIZE and a multiple of the color group size. ---*/
@@ -1553,6 +1563,10 @@
END_SU2_OMP_FOR
}

/*--- Restore preaccumulation and adjoint evaluation state. ---*/
AD::ResumePreaccumulation(pausePreacc);
if (!ReducerStrategy) AD::EndNoSharedReading();

if (ReducerStrategy) {
SumEdgeFluxes(geometry);
if (config->GetKind_TimeIntScheme() == EULER_IMPLICIT) {
@@ -1607,6 +1621,8 @@ void CFVMFlowSolverBase<V, FlowRegime>::SetResidual_DualTime(CGeometry *geometry

/*--- Loop over all nodes (excluding halos) ---*/

AD::StartNoSharedReading();

SU2_OMP_FOR_STAT(omp_chunk_size)
for (iPoint = 0; iPoint < nPointDomain; iPoint++) {

@@ -1642,6 +1658,8 @@ void CFVMFlowSolverBase<V, FlowRegime>::SetResidual_DualTime(CGeometry *geometry
}
END_SU2_OMP_FOR

AD::EndNoSharedReading();

Review comment (PR author): Some of these shared reading optimizations depend on TimeStep being passive. Does this always hold true?

Reply (member): It's a fair-enough assumption. I added some more over the Primitive loops and removed some over smaller loops where the performance benefit might not justify the increased maintenance.

}

else {
@@ -1719,6 +1737,8 @@ void CFVMFlowSolverBase<V, FlowRegime>::SetResidual_DualTime(CGeometry *geometry
/*--- Loop over all nodes (excluding halos) to compute the remainder
of the dual time-stepping source term. ---*/

AD::StartNoSharedReading();

SU2_OMP_FOR_STAT(omp_chunk_size)
for (iPoint = 0; iPoint < nPointDomain; iPoint++) {

Expand Down Expand Up @@ -1756,6 +1776,8 @@ void CFVMFlowSolverBase<V, FlowRegime>::SetResidual_DualTime(CGeometry *geometry
}
}
END_SU2_OMP_FOR

AD::EndNoSharedReading();
}

}
5 changes: 0 additions & 5 deletions SU2_CFD/src/SU2_CFD.cpp
@@ -73,11 +73,6 @@ int main(int argc, char *argv[]) {
#endif
SU2_MPI::Comm MPICommunicator = SU2_MPI::GetComm();

/*--- AD initialization ---*/
#ifdef HAVE_OPDI
AD::getGlobalTape().initialize();
#endif

/*--- Uncomment the following line if runtime NaN catching is desired. ---*/
// feenableexcept(FE_INVALID | FE_OVERFLOW | FE_DIVBYZERO );

6 changes: 6 additions & 0 deletions SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
@@ -880,6 +880,12 @@ void CDiscAdjMultizoneDriver::SetAdj_ObjFunction() {

void CDiscAdjMultizoneDriver::ComputeAdjoints(unsigned short iZone, bool eval_transfer) {

#if defined(CODI_INDEX_TAPE) || defined(HAVE_OPDI)
if (nZone > 1 && rank == MASTER_NODE) {
std::cout << "WARNING: Index AD types do not support multiple zones." << std::endl;
}
#endif

AD::ClearAdjoints();

/*--- Initialize the adjoints in iZone ---*/
6 changes: 6 additions & 0 deletions SU2_CFD/src/integration/CIntegration.cpp
@@ -76,6 +76,10 @@ void CIntegration::Space_Integration(CGeometry *geometry,
CNumerics* conv_bound_numerics = numerics[CONV_BOUND_TERM + omp_get_thread_num()*MAX_TERMS];
CNumerics* visc_bound_numerics = numerics[VISC_BOUND_TERM + omp_get_thread_num()*MAX_TERMS];

/*--- Pause preaccumulation in boundary conditions for hybrid parallel AD. ---*/
/// TODO: Check if this is really needed.
//const auto pausePreacc = (omp_get_num_threads() > 1) && AD::PausePreaccumulation();

/*--- Boundary conditions that depend on other boundaries (they require MPI synchronization) ---*/

solver_container[MainSolver]->BC_Fluid_Interface(geometry, solver_container, conv_bound_numerics, visc_bound_numerics, config);
@@ -181,6 +185,8 @@
solver_container[MainSolver]->BC_Periodic(geometry, solver_container, conv_bound_numerics, config);
}

//AD::ResumePreaccumulation(pausePreacc);

}

void CIntegration::Time_Integration(CGeometry *geometry, CSolver **solver_container, CConfig *config,