openvinotoolkit · dkalinowski · May 7, 2026
diff --git a/docs/parameters.md b/docs/parameters.md
@@ -138,7 +138,7 @@ Task specific parameters for different tasks (text generation/image generation/e
 | `--max_prompt_len`                    | `integer`    | Sets NPU specific property for maximum number of tokens in the prompt.                                                     |
 | `--kv_cache_precision`                | `string`     | Reduced kv cache precision to `u8` lowers the cache size consumption. Accepted values: `u8` or empty (default).            |
 | `--model_distribution_policy`         | `string`     | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |
-| `--reasoning_parser`                  | `string`     | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3]                     |
+| `--reasoning_parser`                  | `string`     | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gemma4, gptoss]     |
 | `--tool_parser`                       | `string`     | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, hermes3, phi4]            |
 | `--enable_tool_guided_generation`     | `bool`       | Enables enforcing tool schema during generation. Requires setting response parser. Default: false.                         |
 

diff --git a/src/llm/BUILD b/src/llm/BUILD
@@ -184,6 +184,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
             "io_processing/devstral/tool_parser.hpp",
             "io_processing/mistral/tool_parser.hpp",
             "io_processing/qwen3/reasoning_parser.hpp",
+            "io_processing/gemma4/reasoning_parser.hpp",
             "io_processing/gptoss/reasoning_parser.hpp",
             "io_processing/gptoss/tool_parser.hpp",
             "io_processing/gptoss/harmony.hpp",

diff --git a/src/llm/io_processing/gemma4/reasoning_parser.hpp b/src/llm/io_processing/gemma4/reasoning_parser.hpp
@@ -0,0 +1,29 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+
+#include <openvino/genai/tokenizer.hpp>
+
+#include "../qwen3/reasoning_parser.hpp"
+
+namespace ovms {
+class Gemma4ReasoningParser : public Qwen3ReasoningParser {  // inherits Qwen3 parsing, only the tags differ
+public:
+    Gemma4ReasoningParser() = delete;  // a tokenizer must always be supplied
+    explicit Gemma4ReasoningParser(ov::genai::Tokenizer& tokenizer) :
+        Qwen3ReasoningParser(tokenizer, "<|channel>thought\n", "<channel|>", true) {}  // startTag, endTag, requiresSpecialTokens
+};
+}  // namespace ovms
diff --git a/src/llm/io_processing/output_parser.cpp b/src/llm/io_processing/output_parser.cpp
@@ -28,6 +28,7 @@
 #include "qwen3/reasoning_parser.hpp"
 #include "qwen3coder/qwen3coder_tool_parser.hpp"
 #include "devstral/tool_parser.hpp"
+#include "gemma4/reasoning_parser.hpp"
 #include "gptoss/reasoning_parser.hpp"
 
 namespace ovms {
@@ -177,6 +178,8 @@ OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string to
 
     if (reasoningParserName == "qwen3") {
         reasoningParser = std::make_unique<Qwen3ReasoningParser>(tokenizer);
+    } else if (reasoningParserName == "gemma4") {
+        reasoningParser = std::make_unique<Gemma4ReasoningParser>(tokenizer);
     } else if (reasoningParserName == "gptoss") {
         reasoningParser = std::make_unique<GptOssReasoningParser>(tokenizer);
     } else if (!reasoningParserName.empty()) {

diff --git a/src/llm/io_processing/qwen3/reasoning_parser.hpp b/src/llm/io_processing/qwen3/reasoning_parser.hpp
@@ -28,26 +28,41 @@ namespace ovms {
 class Qwen3ReasoningParser : public BaseOutputParser {
 protected:
     // Tags used to identify the reasoning segment in the content
-    const std::string parsingStartTag = "<think>";
-    const std::string parsingEndTag = "</think>";
+    const std::string parsingStartTag;  // set from the constructor's startTag argument
+    const std::string parsingEndTag;  // set from the constructor's endTag argument
+    const bool specialTokensRequired;  // value returned by requiresStreamingWithSpecialTokens()
+    const std::vector<std::string> parsingStartTags;  // holds exactly one element: startTag
+    const std::vector<std::string> specialParsingStartTags;  // always constructed empty
+
+    Qwen3ReasoningParser(ov::genai::Tokenizer& tokenizer,  // protected: lets derived parsers pass their own tags
+        const std::string& startTag,
+        const std::string& endTag,
+        bool requiresSpecialTokens) :
+        BaseOutputParser(tokenizer),
+        parsingStartTag(startTag),
+        parsingEndTag(endTag),
+        specialTokensRequired(requiresSpecialTokens),
+        parsingStartTags{startTag},
+        specialParsingStartTags{} {}
 
 public:
     Qwen3ReasoningParser() = delete;
     explicit Qwen3ReasoningParser(ov::genai::Tokenizer& tokenizer) :
-        BaseOutputParser(tokenizer) {}
+        Qwen3ReasoningParser(tokenizer, "<think>", "</think>", false) {}  // delegates with the "<think>"/"</think>" tags
 
     void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
     std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;
     const std::vector<std::string>& getParsingStartTags() const override {
-        static const std::vector<std::string> parsingStartTags{this->parsingStartTag};
         return parsingStartTags;
     }
     const std::vector<std::string>& getSpecialParsingStartTags() const override {
-        static const std::vector<std::string> specialParsingStartTags{};
         return specialParsingStartTags;
     }
     const std::string& getParsingEndTag() const override {
         return parsingEndTag;
     }
+    bool requiresStreamingWithSpecialTokens() const override {  // reports the flag fixed at construction
+        return specialTokensRequired;
+    }
 };
 }  // namespace ovms