LLAMA: add special handling for Gemma chat templates

Chris Warren-Smith · Chris Warren-Smith · commit 555cb1f927d5 · 2026-05-06T18:44:01.000+09:30
diff --git a/llama/llama-sb.cpp b/llama/llama-sb.cpp
@@ -46,6 +46,7 @@ Llama::Llama() :
   _max_tokens(0),
   _log_level(GGML_LOG_LEVEL_CONT),
   _n_past(0),
+  _is_gemma4(false),
   _seed(LLAMA_DEFAULT_SEED) {
   llama_log_set([](enum ggml_log_level level, const char * text, void *user_data) {
     Llama *llama = (Llama *)user_data;
@@ -66,6 +67,7 @@ Llama::Llama(Llama &&other) noexcept
   , _grammar_src(std::move(other._grammar_src))
   , _grammar_root(std::move(other._grammar_root))
   , _last_error(std::move(other._last_error))
+  , _template(std::move(other._template))
   , _penalty_last_n(other._penalty_last_n)
   , _penalty_repeat(other._penalty_repeat)
   , _penalty_freq(other._penalty_freq)
@@ -77,6 +79,7 @@ Llama::Llama(Llama &&other) noexcept
   , _max_tokens(other._max_tokens)
   , _log_level(other._log_level)
   , _n_past(other._n_past)
+  , _is_gemma4(other._is_gemma4)
   , _seed(other._seed) {
 }
 
@@ -95,7 +98,7 @@ Llama::~Llama() {
 
 void Llama::reset() {
   _stop_sequences.clear();
-  _last_error = "";
+  _last_error.clear();
   _penalty_last_n = 64;
   _penalty_repeat = 1.1f;
   _penalty_freq = 0.0f;
@@ -106,8 +109,10 @@ void Llama::reset() {
   _min_p = 0.0f;
   _max_tokens = 150;
   _n_past = 0;
+  _is_gemma4 = false;
   _grammar_src.clear();
   _grammar_root.clear();
+  _template.clear();
   _seed = LLAMA_DEFAULT_SEED;
   if (_ctx) {
     llama_memory_clear(llama_get_memory(_ctx), true);
@@ -142,9 +147,9 @@ bool Llama::construct(string model_path, int n_ctx, int n_batch, int n_gpu_layer
       _vocab = llama_model_get_vocab(_model);
     }
     _template = llama_model_chat_template(_model, nullptr);
+    _is_gemma4 = (_template.find("<|turn>model") != string::npos);
   }
 
-
   return _last_error.empty();
 }
 
@@ -268,16 +273,34 @@ bool Llama::make_space_for_tokens(int n_tokens, int keep_min) {
 }
 
 bool Llama::add_message(LlamaIter &iter, const string &role, const string &content) {
-  llama_chat_message msg = {role.c_str(), content.c_str()};
-
+  llama_chat_message message = {role.c_str(), content.c_str()};
   int buf_size = 2 * (int)(role.size() + content.size() + 64);
   vector<char> buf(buf_size);
-  bool add_ass = (role == "user");
+  bool add_ass = (role == "user" || role == "tool");
+  int32_t n = 0;
+
+  if (_template.empty()) {
+    _last_error = "No chat template available";
+    return false;
+  }
 
-  int32_t n = llama_chat_apply_template(_template, &msg, 1, add_ass, buf.data(), buf.size());
-  if (n > (int32_t)buf.size()) {
-    buf.resize(n);
-    llama_chat_apply_template(_template, &msg, 1, add_ass, buf.data(), buf.size());
+  if (_is_gemma4) {
+    string str = "<|turn>" + role + "\n" + content + "<turn|>\n";
+    if (add_ass) {
+      str += "<|turn>model\n";
+    }
+    n = str.size();
+    buf.assign(str.begin(), str.end());
+    buf.push_back('\0');
+  } else {
+    n = llama_chat_apply_template(_template.c_str(), &message, 1, add_ass, buf.data(), buf_size);
+    if (n < 0) {
+      _last_error = "No chat template no supported";
+      return false;
+    } else if (n > (int32_t)buf.size()) {
+      buf.resize(n);
+      llama_chat_apply_template(_template.c_str(), &message, 1, add_ass, buf.data(), buf.size());
+    }
   }
   string prompt(buf.data(), n);
 
diff --git a/llama/llama-sb.h b/llama/llama-sb.h
@@ -91,7 +91,7 @@ struct Llama {
   string _grammar_src;
   string _grammar_root;
   string _last_error;
-  const char *_template;
+  string _template;
   int32_t _penalty_last_n;
   float _penalty_repeat;
   float _penalty_freq;
@@ -103,5 +103,6 @@ struct Llama {
   int _max_tokens;
   int _log_level;
   int _n_past;
+  bool _is_gemma4;
   unsigned int _seed;
 };
diff --git a/llama/main.cpp b/llama/main.cpp
@@ -413,7 +413,7 @@ static int cmd_llama_add_message(var_s *self, int argc, slib_par_t *arg, var_s *
       int iter_id = ++g_nextId;
       LlamaIter &iter = g_llama_iter[iter_id];
       Llama &llama = g_llama.at(id);
-      auto role = get_param_str(argc, arg, 0, "");
+      auto role = get_param_str(argc, arg, 0, "user");
       auto content = get_param_str(argc, arg, 1, "");
       if (llama.add_message(iter, role, content)) {
         map_init_id(retval, iter_id, CLASS_ID_LLAMA_ITER);
@@ -423,6 +423,7 @@ static int cmd_llama_add_message(var_s *self, int argc, slib_par_t *arg, var_s *
         v_create_callback(retval, "tokens_sec", cmd_llama_tokens_sec);
         result = 1;
       } else {
+        g_llama_iter.erase(iter_id);
         error(retval, llama.last_error());
       }
     }