server : fixes

ggml-ci
ggml-org · Nov 22, 2024 · 0c74590
1 parent 7dc6ae5
commit 0c74590
Showing 1 changed file with 13 additions and 16 deletions.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -645,18 +645,16 @@ struct server_context {
 
         // Clear any sampling context
         for (server_slot & slot : slots) {
-            if (slot.smpl != nullptr) {
-                llama_free(slot.ctx_dft);
-                slot.ctx_dft = nullptr;
+            common_sampler_free(slot.smpl);
+            slot.smpl = nullptr;
 
-                common_speculative_free(slot.spec);
-                slot.spec = nullptr;
+            llama_free(slot.ctx_dft);
+            slot.ctx_dft = nullptr;
 
-                common_sampler_free(slot.smpl);
-                slot.smpl = nullptr;
+            common_speculative_free(slot.spec);
+            slot.spec = nullptr;
 
-                llama_batch_free(slot.batch_spec);
-            }
+            llama_batch_free(slot.batch_spec);
         }
 
         llama_batch_free(batch);
@@ -688,15 +686,9 @@ struct server_context {
 
             auto params_dft = params;
 
-            params_dft.model = params.model_draft;
+            params_dft.model        = params.model_draft;
             params_dft.n_gpu_layers = params.n_gpu_layers_draft;
 
-            if (params.draft_cpuparams.n_threads > 0) {
-                params_dft.cpuparams.n_threads = params.draft_cpuparams.n_threads;
-            }
-
-            params_dft.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
-
             common_init_result llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model;
@@ -708,10 +700,15 @@ struct server_context {
 
             if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.model_draft.c_str(), params.model.c_str());
+
+                llama_free      (llama_init_dft.context);
+                llama_free_model(llama_init_dft.model);
+
                 return false;
             }
 
             cparams_dft = common_context_params_to_llama(params);
+            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
 
             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);