diff --git a/docs/parameters.md b/docs/parameters.md index d83e5497c6..fce51ad2e0 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -138,7 +138,7 @@ Task specific parameters for different tasks (text generation/image generation/e | `--max_prompt_len` | `integer` | Sets NPU specific property for maximum number of tokens in the prompt. | | `--kv_cache_precision` | `string` | Reduced kv cache precision to `u8` lowers the cache size consumption. Accepted values: `u8` or empty (default). | | `--model_distribution_policy` | `string` | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). | -| `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3] | +| `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gemma4] | | `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, hermes3, phi4] | | `--enable_tool_guided_generation` | `bool` | Enables enforcing tool schema during generation. Requires setting response parser. Default: false. 
| diff --git a/src/llm/BUILD b/src/llm/BUILD index 8fe6059d71..48928adf62 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -184,6 +184,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w "io_processing/devstral/tool_parser.hpp", "io_processing/mistral/tool_parser.hpp", "io_processing/qwen3/reasoning_parser.hpp", + "io_processing/gemma4/reasoning_parser.hpp", "io_processing/gptoss/reasoning_parser.hpp", "io_processing/gptoss/tool_parser.hpp", "io_processing/gptoss/harmony.hpp", diff --git a/src/llm/io_processing/gemma4/reasoning_parser.hpp b/src/llm/io_processing/gemma4/reasoning_parser.hpp new file mode 100644 index 0000000000..e3aba30796 --- /dev/null +++ b/src/llm/io_processing/gemma4/reasoning_parser.hpp @@ -0,0 +1,29 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#pragma once + +#include <string> + +#include "../qwen3/reasoning_parser.hpp" + +namespace ovms { +class Gemma4ReasoningParser : public Qwen3ReasoningParser { +public: + Gemma4ReasoningParser() = delete; + explicit Gemma4ReasoningParser(ov::genai::Tokenizer& tokenizer) : + Qwen3ReasoningParser(tokenizer, "<|channel>thought\n", "", true) {} +}; +} // namespace ovms diff --git a/src/llm/io_processing/output_parser.cpp b/src/llm/io_processing/output_parser.cpp index 1c060375df..7e95f2b2c4 100644 --- a/src/llm/io_processing/output_parser.cpp +++ b/src/llm/io_processing/output_parser.cpp @@ -28,6 +28,7 @@ #include "qwen3/reasoning_parser.hpp" #include "qwen3coder/qwen3coder_tool_parser.hpp" #include "devstral/tool_parser.hpp" +#include "gemma4/reasoning_parser.hpp" #include "gptoss/reasoning_parser.hpp" namespace ovms { @@ -177,6 +178,8 @@ OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string to if (reasoningParserName == "qwen3") { reasoningParser = std::make_unique<Qwen3ReasoningParser>(tokenizer); + } else if (reasoningParserName == "gemma4") { + reasoningParser = std::make_unique<Gemma4ReasoningParser>(tokenizer); } else if (reasoningParserName == "gptoss") { reasoningParser = std::make_unique<GptOssReasoningParser>(tokenizer); } else if (!reasoningParserName.empty()) { diff --git a/src/llm/io_processing/qwen3/reasoning_parser.hpp b/src/llm/io_processing/qwen3/reasoning_parser.hpp index 6254e874e5..d7ccb9d76a 100644 --- a/src/llm/io_processing/qwen3/reasoning_parser.hpp +++ b/src/llm/io_processing/qwen3/reasoning_parser.hpp @@ -28,26 +28,41 @@ namespace ovms { class Qwen3ReasoningParser : public BaseOutputParser { protected: // Tags used to identify the reasoning segment in the content - const std::string parsingStartTag = "<think>"; - const std::string parsingEndTag = "</think>"; + const std::string parsingStartTag; + const std::string parsingEndTag; + const bool specialTokensRequired; + const std::vector<std::string> parsingStartTags; + const std::vector<std::string> 
specialParsingStartTags; + + Qwen3ReasoningParser(ov::genai::Tokenizer& tokenizer, + const std::string& startTag, + const std::string& endTag, + bool requiresSpecialTokens) : + BaseOutputParser(tokenizer), + parsingStartTag(startTag), + parsingEndTag(endTag), + specialTokensRequired(requiresSpecialTokens), + parsingStartTags{startTag}, + specialParsingStartTags{} {} public: Qwen3ReasoningParser() = delete; explicit Qwen3ReasoningParser(ov::genai::Tokenizer& tokenizer) : - BaseOutputParser(tokenizer) {} + Qwen3ReasoningParser(tokenizer, "<think>", "</think>", false) {} void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override; std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override; const std::vector<std::string>& getParsingStartTags() const override { - static const std::vector<std::string> parsingStartTags{this->parsingStartTag}; return parsingStartTags; } const std::vector<std::string>& getSpecialParsingStartTags() const override { - static const std::vector<std::string> specialParsingStartTags{}; return specialParsingStartTags; } const std::string& getParsingEndTag() const override { return parsingEndTag; } + bool requiresStreamingWithSpecialTokens() const override { + return specialTokensRequired; + } }; } // namespace ovms