Fix GPU python wheel CI #943

Merged: 4 commits, Jul 23, 2024. Changes shown are from all commits.
2 changes: 1 addition & 1 deletion ci/slab.toml
@@ -132,5 +132,5 @@ check_run_name = "Concrete Python Release (GPU)"

[command.concrete-python-test-gpu-wheel]
workflow = "concrete_python_test_gpu_wheel.yml"
profile = "gpu-test-ubuntu22"
profile = "gpu-test"
check_run_name = "Concrete Python Test GPU Wheel"
53 changes: 42 additions & 11 deletions compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -94,6 +94,7 @@ union Context {
// across multiple locations.
static const int32_t host_location = -1;
static const int32_t split_location = -2;
static const int32_t invalid_location = -3;
// Similarly dependence chunks are either indexed (which does not
// always correlate to the device index on which they are located) or
// this dependence is split further.
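For readers unfamiliar with the location encoding: non-negative values are device indices, while the negative sentinels mark special states, and the new invalid_location is assigned later in this diff when a chunk's device copy is dropped without any host copy existing. The following is an illustrative sketch only; the three constants mirror the diff, but ChunkSketch, drop_device_copy and has_valid_copy are hypothetical simplified names, not the runtime's types.

// Illustrative sketch, not part of the diff: simplified chunk location bookkeeping.
#include <cstdint>

static const std::int32_t host_location = -1;    // a host copy exists
static const std::int32_t split_location = -2;   // data is split into sub-chunks
static const std::int32_t invalid_location = -3; // no valid copy anywhere

struct ChunkSketch {                 // hypothetical stand-in for a dependence chunk
  std::int32_t location = invalid_location;
  bool onHostReady = false;
  void *device_data = nullptr;

  // After dropping the device buffer, the stale device index must not remain in
  // `location`; keep host_location only if a host copy actually exists.
  void drop_device_copy() {
    device_data = nullptr;
    location = onHostReady ? host_location : invalid_location;
  }

  bool has_valid_copy() const { return location >= 0 || location == host_location; }
};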
@@ -264,17 +265,40 @@ struct Dependence {
// multiple GPUs or execute concurrently on the host.
void split_dependence(size_t num_chunks, size_t num_gpu_chunks,
size_t chunk_dim, bool constant,
size_t gpu_chunk_factor) {
size_t gpu_chunk_factor, GPU_DFG *dfg) {
// If this dependence is already split, check that the split
// matches the new request
if (chunk_id == split_chunks) {
if (num_chunks + num_gpu_chunks != chunks.size())
warnx("WARNING: requesting to split dependence across different number "
"of chunks (%lu) than it already is split (%lu) which would "
"require remapping. This is not supported.",
num_chunks + num_gpu_chunks, chunks.size());
assert(num_chunks + num_gpu_chunks == chunks.size());
return;
if (num_chunks + num_gpu_chunks != chunks.size()) {
// If this is not available on host, then we need to merge on
// host and re-split
if (!onHostReady) {
size_t data_size = 0;
size_t num_samples = 0;
for (auto c : chunks) {
move_chunk_off_device(c->chunk_id, dfg);
data_size += memref_get_data_size(c->host_data);
num_samples += c->host_data.sizes[chunk_dim];
sdfg_gpu_debug_print_mref("Chunk", c->host_data);
}
host_data = chunks[0]->host_data;
host_data.allocated = host_data.aligned =
(uint64_t *)malloc(data_size);
host_data.sizes[chunk_dim] = num_samples;
size_t pos = 0;
for (auto c : chunks) {
memcpy(((char *)host_data.aligned) + pos, c->host_data.aligned,
memref_get_data_size(c->host_data));
pos += memref_get_data_size(c->host_data);
}
for (auto c : chunks)
free_chunk_host_data(c->chunk_id, dfg);
onHostReady = true;
hostAllocated = true;
}
} else {
return;
}
}
if (!chunks.empty()) {
for (auto c : chunks)
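The new branch above handles a re-split request with a different chunk count: when no host copy is available, the existing chunks are first copied back and concatenated into one contiguous host buffer, and only then re-split. A minimal standalone sketch of that concatenation step follows; it is illustrative only, and HostChunk and merge_chunks_on_host are simplified stand-ins rather than the runtime's memref-based types.

// Illustrative sketch: concatenate per-chunk host buffers into one contiguous buffer.
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <vector>

struct HostChunk {            // hypothetical stand-in for a chunk's host-side buffer
  const char *data;
  std::size_t size_bytes;
  std::size_t num_samples;
};

// Concatenate all chunk buffers into one freshly malloc'ed contiguous host buffer,
// mirroring what the split_dependence change does before it re-splits the data.
static char *merge_chunks_on_host(const std::vector<HostChunk> &chunks,
                                  std::size_t &total_bytes,
                                  std::size_t &total_samples) {
  total_bytes = 0;
  total_samples = 0;
  for (const auto &c : chunks) {
    total_bytes += c.size_bytes;
    total_samples += c.num_samples;
  }
  char *merged = static_cast<char *>(std::malloc(total_bytes));
  std::size_t pos = 0;
  for (const auto &c : chunks) {
    std::memcpy(merged + pos, c.data, c.size_bytes);
    pos += c.size_bytes;
  }
  return merged;
}

int main() {
  std::vector<HostChunk> chunks = {{"abcd", 4, 2}, {"efgh", 4, 2}};
  std::size_t bytes = 0, samples = 0;
  char *merged = merge_chunks_on_host(chunks, bytes, samples); // "abcdefgh", 8 bytes, 4 samples
  std::free(merged);
  return 0;
}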
@@ -345,12 +369,14 @@
}
void move_chunk_off_device(int32_t chunk_id, GPU_DFG *dfg) {
copy_chunk_off_device(chunk_id, dfg);
chunks[chunk_id]->location = host_location;
if (chunks[chunk_id]->device_data == nullptr)
return;
cuda_drop_async(
chunks[chunk_id]->device_data,
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
chunks[chunk_id]->location = host_location;
}
void merge_output_off_device(int32_t chunk_id, GPU_DFG *dfg) {
assert(chunks[chunk_id]->location > host_location);
@@ -381,6 +407,8 @@ struct Dependence {
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
chunks[chunk_id]->location =
(chunks[chunk_id]->onHostReady) ? host_location : invalid_location;
}
inline void free_data(GPU_DFG *dfg, bool immediate = false) {
if (device_data != nullptr) {
@@ -488,6 +516,7 @@ struct Stream {
bool ct_stream;
bool pt_stream;
size_t generation;
std::atomic<size_t> uses = {0};
const char *name;
Stream(stream_type t, const char *sname = nullptr)
: dep(nullptr), type(t), producer(nullptr), dfg(nullptr),
@@ -527,6 +556,7 @@
dep = d;
}
dep->stream_generation = generation;
uses = 0;
}
// For a given dependence, traverse the DFG backwards to extract the lattice
// of kernels required to execute to produce this data
@@ -689,7 +719,7 @@ struct Stream {
for (auto i : inputs)
i->dep->split_dependence(num_chunks, num_gpu_chunks,
(i->ct_stream) ? 0 : 1, i->const_stream,
gpu_chunk_factor);
gpu_chunk_factor, dfg);
for (auto iv : intermediate_values) {
if (iv->need_new_gen()) {
iv->put(new Dependence(split_location,
@@ -835,7 +865,8 @@ struct Stream {
}
}
for (auto i : inputs)
i->dep->free_chunk_device_data(c, dfg);
if (++i->uses == i->consumers.size())
i->dep->free_chunk_device_data(c, dfg);
for (auto iv : intermediate_values)
iv->dep->free_chunk_device_data(c, dfg);
for (auto o : outputs)
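The uses counter introduced in the GPUDFG.cpp changes above appears to act as a small reference count: an input's device chunk is freed only once every consumer of the stream has processed it, instead of after the first consumer. A reduced, illustrative sketch of that pattern follows; SharedInputSketch and its members are simplified assumptions, not the runtime's Stream type.

// Illustrative sketch: free a shared device buffer only after its last consumer.
#include <atomic>
#include <cstddef>

struct SharedInputSketch {               // hypothetical stand-in for a stream input
  std::atomic<std::size_t> uses{0};      // reset to 0 whenever a new value arrives
  std::size_t num_consumers = 0;
  void *device_data = nullptr;

  // Stand-in for free_chunk_device_data(...) / cuda_drop_async(...).
  void release_device_data() { device_data = nullptr; }

  // Each consumer kernel calls this once per chunk; only the last consumer to
  // finish actually releases the shared device buffer.
  void consumed_by_one_kernel() {
    if (++uses == num_consumers)
      release_device_data();
  }
};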
@@ -300,9 +300,6 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
if (loopParallelize)
mlir::concretelang::dfr::_dfr_set_use_omp(true);

if (dataflowParallelize)
mlir::concretelang::dfr::_dfr_set_required(true);

// Sanity checks for enabling GPU usage: the compiler must have been
// compiled with Cuda support (especially important when building
// python wheels), and at least one device must be available to
@@ -340,8 +337,22 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
options.batchTFHEOps = false;
}
}

// Finally for now we cannot allow dataflow parallelization at the
// same time as GPU usage. This restriction will be relaxed later.
if (dataflowParallelize) {
warnx("Dataflow parallelization and GPU offloading have both been "
"requested. This is not currently supported. Continuing without "
"dataflow parallelization.");
dataflowParallelize = false;
}
}

// If dataflow parallelization will proceed, mark it for
// initialising the runtime
if (dataflowParallelize)
mlir::concretelang::dfr::_dfr_set_required(true);

mlir::OwningOpRef<mlir::ModuleOp> mlirModuleRef(moduleOp);
res.mlirModuleRef = std::move(mlirModuleRef);
mlir::ModuleOp module = res.mlirModuleRef->get();
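The reordering in this hunk moves the _dfr_set_required(true) call after the GPU checks, so the dataflow runtime is only marked as required once it is known that dataflow parallelization will actually proceed. A condensed, illustrative sketch of the resulting decision order follows; resolve_parallelization and mark_dataflow_runtime_required are hypothetical names, and only _dfr_set_required comes from the diff.

// Illustrative sketch of the option-resolution order after this change.
#include <cstdio>

// Stand-in for mlir::concretelang::dfr::_dfr_set_required(true).
static void mark_dataflow_runtime_required() {}

static void resolve_parallelization(bool &dataflowParallelize, bool useGPU) {
  if (useGPU && dataflowParallelize) {
    // GPU offloading and dataflow parallelization cannot currently be combined:
    // warn and continue without dataflow parallelization.
    std::fprintf(stderr, "warning: continuing without dataflow parallelization\n");
    dataflowParallelize = false;
  }
  // Marking the runtime as required now happens only after the GPU check, so a
  // request that was just dropped no longer forces dataflow runtime initialisation.
  if (dataflowParallelize)
    mark_dataflow_runtime_required();
}

int main() {
  bool dataflow = true;
  resolve_parallelization(dataflow, /*useGPU=*/true); // dataflow becomes false
  return 0;
}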