Fix GPU python wheel CI #943

Merged: 4 commits, Jul 23, 2024. Changes shown are from all commits.
2 changes: 1 addition & 1 deletion ci/slab.toml
@@ -132,5 +132,5 @@ check_run_name = "Concrete Python Release (GPU)"

[command.concrete-python-test-gpu-wheel]
workflow = "concrete_python_test_gpu_wheel.yml"
profile = "gpu-test-ubuntu22"
profile = "gpu-test"
check_run_name = "Concrete Python Test GPU Wheel"
53 changes: 42 additions & 11 deletions compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -94,6 +94,7 @@ union Context {
// across multiple locations.
static const int32_t host_location = -1;
static const int32_t split_location = -2;
static const int32_t invalid_location = -3;
// Similarly dependence chunks are either indexed (which does not
// always correlate to the device index on which they are located) or
// this dependence is split further.
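For readers unfamiliar with the location encoding: non-negative values are device indices, while the negative sentinels mark special states, and the new invalid_location is assigned later in this diff when a chunk's device copy is dropped without any host copy existing. The following is an illustrative sketch only; the three constants mirror the diff, but ChunkSketch, drop_device_copy and has_valid_copy are hypothetical simplified names, not the runtime's types.

// Illustrative sketch, not part of the diff: simplified chunk location bookkeeping.
#include <cstdint>

static const std::int32_t host_location = -1;    // a host copy exists
static const std::int32_t split_location = -2;   // data is split into sub-chunks
static const std::int32_t invalid_location = -3; // no valid copy anywhere

struct ChunkSketch {                 // hypothetical stand-in for a dependence chunk
  std::int32_t location = invalid_location;
  bool onHostReady = false;
  void *device_data = nullptr;

  // After dropping the device buffer, the stale device index must not remain in
  // `location`; keep host_location only if a host copy actually exists.
  void drop_device_copy() {
    device_data = nullptr;
    location = onHostReady ? host_location : invalid_location;
  }

  bool has_valid_copy() const { return location >= 0 || location == host_location; }
};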
@@ -264,17 +265,40 @@ struct Dependence {
// multiple GPUs or execute concurrently on the host.
void split_dependence(size_t num_chunks, size_t num_gpu_chunks,
size_t chunk_dim, bool constant,
size_t gpu_chunk_factor) {
size_t gpu_chunk_factor, GPU_DFG *dfg) {
// If this dependence is already split, check that the split
// matches the new request
if (chunk_id == split_chunks) {
if (num_chunks + num_gpu_chunks != chunks.size())
warnx("WARNING: requesting to split dependence across different number "
"of chunks (%lu) than it already is split (%lu) which would "
"require remapping. This is not supported.",
num_chunks + num_gpu_chunks, chunks.size());
assert(num_chunks + num_gpu_chunks == chunks.size());
return;
if (num_chunks + num_gpu_chunks != chunks.size()) {
// If this is not available on host, then we need to merge on
// host and re-split
if (!onHostReady) {
size_t data_size = 0;
size_t num_samples = 0;
for (auto c : chunks) {
move_chunk_off_device(c->chunk_id, dfg);
data_size += memref_get_data_size(c->host_data);
num_samples += c->host_data.sizes[chunk_dim];
sdfg_gpu_debug_print_mref("Chunk", c->host_data);
}
host_data = chunks[0]->host_data;
host_data.allocated = host_data.aligned =
(uint64_t *)malloc(data_size);
host_data.sizes[chunk_dim] = num_samples;
size_t pos = 0;
for (auto c : chunks) {
memcpy(((char *)host_data.aligned) + pos, c->host_data.aligned,
memref_get_data_size(c->host_data));
pos += memref_get_data_size(c->host_data);
}
for (auto c : chunks)
free_chunk_host_data(c->chunk_id, dfg);
onHostReady = true;
hostAllocated = true;
}
} else {
return;
}
}
if (!chunks.empty()) {
for (auto c : chunks)
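The new branch above handles a re-split request with a different chunk count: when no host copy is available, the existing chunks are first copied back and concatenated into one contiguous host buffer, and only then re-split. A minimal standalone sketch of that concatenation step follows; it is illustrative only, and HostChunk and merge_chunks_on_host are simplified stand-ins rather than the runtime's memref-based types.

// Illustrative sketch: concatenate per-chunk host buffers into one contiguous buffer.
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <vector>

struct HostChunk {            // hypothetical stand-in for a chunk's host-side buffer
  const char *data;
  std::size_t size_bytes;
  std::size_t num_samples;
};

// Concatenate all chunk buffers into one freshly malloc'ed contiguous host buffer,
// mirroring what the split_dependence change does before it re-splits the data.
static char *merge_chunks_on_host(const std::vector<HostChunk> &chunks,
                                  std::size_t &total_bytes,
                                  std::size_t &total_samples) {
  total_bytes = 0;
  total_samples = 0;
  for (const auto &c : chunks) {
    total_bytes += c.size_bytes;
    total_samples += c.num_samples;
  }
  char *merged = static_cast<char *>(std::malloc(total_bytes));
  std::size_t pos = 0;
  for (const auto &c : chunks) {
    std::memcpy(merged + pos, c.data, c.size_bytes);
    pos += c.size_bytes;
  }
  return merged;
}

int main() {
  std::vector<HostChunk> chunks = {{"abcd", 4, 2}, {"efgh", 4, 2}};
  std::size_t bytes = 0, samples = 0;
  char *merged = merge_chunks_on_host(chunks, bytes, samples); // "abcdefgh", 8 bytes, 4 samples
  std::free(merged);
  return 0;
}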
@@ -345,12 +369,14 @@
}
void move_chunk_off_device(int32_t chunk_id, GPU_DFG *dfg) {
copy_chunk_off_device(chunk_id, dfg);
chunks[chunk_id]->location = host_location;
if (chunks[chunk_id]->device_data == nullptr)
return;
cuda_drop_async(
chunks[chunk_id]->device_data,
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
chunks[chunk_id]->location = host_location;
}
void merge_output_off_device(int32_t chunk_id, GPU_DFG *dfg) {
assert(chunks[chunk_id]->location > host_location);
@@ -381,6 +407,8 @@ struct Dependence {
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
chunks[chunk_id]->location =
(chunks[chunk_id]->onHostReady) ? host_location : invalid_location;
}
inline void free_data(GPU_DFG *dfg, bool immediate = false) {
if (device_data != nullptr) {
@@ -488,6 +516,7 @@ struct Stream {
bool ct_stream;
bool pt_stream;
size_t generation;
std::atomic<size_t> uses = {0};
const char *name;
Stream(stream_type t, const char *sname = nullptr)
: dep(nullptr), type(t), producer(nullptr), dfg(nullptr),
@@ -527,6 +556,7 @@
dep = d;
}
dep->stream_generation = generation;
uses = 0;
}
// For a given dependence, traverse the DFG backwards to extract the lattice
// of kernels required to execute to produce this data
@@ -689,7 +719,7 @@ struct Stream {
for (auto i : inputs)
i->dep->split_dependence(num_chunks, num_gpu_chunks,
(i->ct_stream) ? 0 : 1, i->const_stream,
gpu_chunk_factor);
gpu_chunk_factor, dfg);
for (auto iv : intermediate_values) {
if (iv->need_new_gen()) {
iv->put(new Dependence(split_location,
@@ -835,7 +865,8 @@ struct Stream {
}
}
for (auto i : inputs)
i->dep->free_chunk_device_data(c, dfg);
if (++i->uses == i->consumers.size())
i->dep->free_chunk_device_data(c, dfg);
for (auto iv : intermediate_values)
iv->dep->free_chunk_device_data(c, dfg);
for (auto o : outputs)
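The uses counter introduced in the GPUDFG.cpp changes above appears to act as a small reference count: an input's device chunk is freed only once every consumer of the stream has processed it, instead of after the first consumer. A reduced, illustrative sketch of that pattern follows; SharedInputSketch and its members are simplified assumptions, not the runtime's Stream type.

// Illustrative sketch: free a shared device buffer only after its last consumer.
#include <atomic>
#include <cstddef>

struct SharedInputSketch {               // hypothetical stand-in for a stream input
  std::atomic<std::size_t> uses{0};      // reset to 0 whenever a new value arrives
  std::size_t num_consumers = 0;
  void *device_data = nullptr;

  // Stand-in for free_chunk_device_data(...) / cuda_drop_async(...).
  void release_device_data() { device_data = nullptr; }

  // Each consumer kernel calls this once per chunk; only the last consumer to
  // finish actually releases the shared device buffer.
  void consumed_by_one_kernel() {
    if (++uses == num_consumers)
      release_device_data();
  }
};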
@@ -300,9 +300,6 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
if (loopParallelize)
mlir::concretelang::dfr::_dfr_set_use_omp(true);

if (dataflowParallelize)
mlir::concretelang::dfr::_dfr_set_required(true);

// Sanity checks for enabling GPU usage: the compiler must have been
// compiled with Cuda support (especially important when building
// python wheels), and at least one device must be available to
@@ -340,8 +337,22 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
options.batchTFHEOps = false;
}
}

// Finally for now we cannot allow dataflow parallelization at the
// same time as GPU usage. This restriction will be relaxed later.
if (dataflowParallelize) {
warnx("Dataflow parallelization and GPU offloading have both been "
"requested. This is not currently supported. Continuing without "
"dataflow parallelization.");
dataflowParallelize = false;
}
}

// If dataflow parallelization will proceed, mark it for
// initialising the runtime
if (dataflowParallelize)
mlir::concretelang::dfr::_dfr_set_required(true);

mlir::OwningOpRef<mlir::ModuleOp> mlirModuleRef(moduleOp);
res.mlirModuleRef = std::move(mlirModuleRef);
mlir::ModuleOp module = res.mlirModuleRef->get();
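The reordering in this hunk moves the _dfr_set_required(true) call after the GPU checks, so the dataflow runtime is only marked as required once it is known that dataflow parallelization will actually proceed. A condensed, illustrative sketch of the resulting decision order follows; resolve_parallelization and mark_dataflow_runtime_required are hypothetical names, and only _dfr_set_required comes from the diff.

// Illustrative sketch of the option-resolution order after this change.
#include <cstdio>

// Stand-in for mlir::concretelang::dfr::_dfr_set_required(true).
static void mark_dataflow_runtime_required() {}

static void resolve_parallelization(bool &dataflowParallelize, bool useGPU) {
  if (useGPU && dataflowParallelize) {
    // GPU offloading and dataflow parallelization cannot currently be combined:
    // warn and continue without dataflow parallelization.
    std::fprintf(stderr, "warning: continuing without dataflow parallelization\n");
    dataflowParallelize = false;
  }
  // Marking the runtime as required now happens only after the GPU check, so a
  // request that was just dropped no longer forces dataflow runtime initialisation.
  if (dataflowParallelize)
    mark_dataflow_runtime_required();
}

int main() {
  bool dataflow = true;
  resolve_parallelization(dataflow, /*useGPU=*/true); // dataflow becomes false
  return 0;
}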