Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions openevolve/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ class LLMModelConfig:
# Reasoning parameters
reasoning_effort: Optional[str] = None

# Reasoning model override: True forces reasoning-model parameter conventions
# (max_completion_tokens, no temperature/top_p), False forces standard conventions,
# None (default) auto-detects based on known OpenAI reasoning model prefixes.
is_reasoning_model: Optional[bool] = None

# Manual mode (human-in-the-loop)
manual_mode: Optional[bool] = None
_manual_queue_dir: Optional[str] = None
Expand Down Expand Up @@ -167,6 +172,8 @@ def __post_init__(self):
self.evaluator_models = self.models.copy()

# Update models with shared configuration values
# Note: is_reasoning_model is intentionally excluded from shared_config.
# It is a per-model override, not a shared default.
shared_config = {
"api_base": self.api_base,
"api_key": self.api_key,
Expand Down Expand Up @@ -221,6 +228,8 @@ def rebuild_models(self) -> None:
self.evaluator_models = self.models.copy()

# Update models with shared configuration values
# Note: is_reasoning_model is intentionally excluded from shared_config.
# It is a per-model override, not a shared default.
shared_config = {
"api_base": self.api_base,
"api_key": self.api_key,
Expand Down
61 changes: 38 additions & 23 deletions openevolve/llm/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,39 @@

logger = logging.getLogger(__name__)

# OpenAI reasoning models that require max_completion_tokens instead of max_tokens.
# These models don't support temperature/top_p and use different parameters.
OPENAI_REASONING_MODEL_PREFIXES: tuple[str, ...] = (
    # O-series reasoning models (o1, o1-mini, o1-preview, o3, o3-mini, o3-pro, o4-mini, etc.)
    "o1",
    "o3",
    "o4-",
    # GPT-5 series (gpt-5, gpt-5-mini, gpt-5-nano, etc.)
    "gpt-5",
    # GPT OSS series (gpt-oss-120b, gpt-oss-20b, etc.)
    "gpt-oss-",
)


def is_reasoning_model(
    model_name: str,
    config_flag: Optional[bool] = None,
) -> bool:
    """Detect if a model should be treated as a reasoning model.

    Args:
        model_name: The model name/identifier. Coerced via ``str()`` so
            non-string model objects are handled, matching the previous
            inline detection which used ``str(self.model).lower()``.
        config_flag: Explicit override from config. If True/False, returns that
            value directly. If None (default), auto-detects based on known
            OpenAI reasoning model prefixes.

    Returns:
        True if the model should be treated as a reasoning model.
    """
    # An explicit config override always wins over the name-based heuristic.
    if config_flag is not None:
        return config_flag
    # Case-insensitive prefix match. str() restores the coercion the old
    # inline check performed, so non-string identifiers don't raise.
    return str(model_name).lower().startswith(OPENAI_REASONING_MODEL_PREFIXES)


def _iso_now() -> str:
return datetime.now(tz=timezone.utc).isoformat()
Expand Down Expand Up @@ -63,9 +96,10 @@ def __init__(
self.api_key = model_cfg.api_key
self.random_seed = getattr(model_cfg, "random_seed", None)
self.reasoning_effort = getattr(model_cfg, "reasoning_effort", None)
self.is_reasoning_model_flag = getattr(model_cfg, "is_reasoning_model", None)

# Manual mode: enabled via llm.manual_mode in config.yaml
self.manual_mode = (getattr(model_cfg, "manual_mode", False) is True)
self.manual_mode = getattr(model_cfg, "manual_mode", False) is True
self.manual_queue_dir: Optional[Path] = None

if self.manual_mode:
Expand Down Expand Up @@ -114,29 +148,10 @@ async def generate_with_context(
formatted_messages.extend(messages)

# Set up generation parameters
# Define OpenAI reasoning models that require max_completion_tokens
# These models don't support temperature/top_p and use different parameters
OPENAI_REASONING_MODEL_PREFIXES = (
# O-series reasoning models
"o1-",
"o1", # o1, o1-mini, o1-preview
"o3-",
"o3", # o3, o3-mini, o3-pro
"o4-", # o4-mini
# GPT-5 series are also reasoning models
"gpt-5-",
"gpt-5", # gpt-5, gpt-5-mini, gpt-5-nano
# The GPT OSS series are also reasoning models
"gpt-oss-120b",
"gpt-oss-20b",
)

# Check if this is an OpenAI reasoning model based on model name pattern
# This works for all endpoints (OpenAI, Azure, OptiLLM, OpenRouter, etc.)
model_lower = str(self.model).lower()
is_openai_reasoning_model = model_lower.startswith(OPENAI_REASONING_MODEL_PREFIXES)
# Detect whether to use reasoning-model parameter conventions
is_reasoning = is_reasoning_model(self.model, self.is_reasoning_model_flag)

if is_openai_reasoning_model:
if is_reasoning:
# For OpenAI reasoning models
params = {
"model": self.model,
Expand Down
81 changes: 31 additions & 50 deletions tests/test_openai_model_detection.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,22 @@
"""
Test OpenAI reasoning model detection logic

Updated to use the extracted is_reasoning_model() function instead of
duplicating detection logic locally.
"""

import unittest
from unittest.mock import MagicMock

from openevolve.llm.openai import OPENAI_REASONING_MODEL_PREFIXES, is_reasoning_model


class TestOpenAIReasoningModelDetection(unittest.TestCase):
"""Test that OpenAI reasoning models are correctly identified"""
"""Test that OpenAI reasoning models are correctly identified via auto-detection"""

def test_reasoning_model_detection(self):
"""Test various model names to ensure correct reasoning model detection"""

# Define the same constants as in the code
OPENAI_REASONING_MODEL_PREFIXES = (
# O-series reasoning models
"o1-",
"o1", # o1, o1-mini, o1-preview
"o3-",
"o3", # o3, o3-mini, o3-pro
"o4-", # o4-mini
# GPT-5 series are also reasoning models
"gpt-5-",
"gpt-5", # gpt-5, gpt-5-mini, gpt-5-nano
)

def is_reasoning_model(model_name, api_base="https://api.openai.com/v1"):
"""Test function that mimics the logic in openai.py"""
model_lower = str(model_name).lower()
return api_base == "https://api.openai.com/v1" and model_lower.startswith(
OPENAI_REASONING_MODEL_PREFIXES
)

# Test cases: (model_name, expected_result, description)
test_cases = [
# Reasoning models - should return True
# Reasoning models - should return True (auto-detect)
("o1", True, "Base o1 model"),
("o1-mini", True, "o1-mini model"),
("o1-preview", True, "o1-preview model"),
Expand All @@ -46,14 +28,16 @@ def is_reasoning_model(model_name, api_base="https://api.openai.com/v1"):
("gpt-5", True, "Base gpt-5 model"),
("gpt-5-mini", True, "gpt-5-mini model"),
("gpt-5-nano", True, "gpt-5-nano model"),
# Non-reasoning models - should return False
("gpt-oss-120b", True, "gpt-oss-120b model"),
("gpt-oss-20b", True, "gpt-oss-20b model"),
# Non-reasoning models - should return False (auto-detect)
("gpt-4o-mini", False, "gpt-4o-mini (not reasoning)"),
("gpt-4o", False, "gpt-4o (not reasoning)"),
("gpt-4", False, "gpt-4 (not reasoning)"),
("gpt-3.5-turbo", False, "gpt-3.5-turbo (not reasoning)"),
("claude-3", False, "Non-OpenAI model"),
("gemini-pro", False, "Non-OpenAI model"),
# Edge cases
# Case insensitivity
("O1-MINI", True, "Uppercase o1-mini"),
("GPT-5-MINI", True, "Uppercase gpt-5-mini"),
]
Expand All @@ -67,32 +51,29 @@ def is_reasoning_model(model_name, api_base="https://api.openai.com/v1"):
f"Model '{model_name}' ({description}): expected {expected}, got {result}",
)

def test_non_openai_api_base(self):
"""Test that non-OpenAI API bases don't trigger reasoning model logic"""
OPENAI_REASONING_MODEL_PREFIXES = ("o1-", "o1", "o3-", "o3", "o4-", "gpt-5-", "gpt-5")

def is_reasoning_model(model_name, api_base):
model_lower = str(model_name).lower()
return api_base == "https://api.openai.com/v1" and model_lower.startswith(
OPENAI_REASONING_MODEL_PREFIXES
)

# Even reasoning model names should return False for non-OpenAI APIs
test_cases = [
("o1-mini", "https://api.anthropic.com/v1", False),
("gpt-5", "https://generativelanguage.googleapis.com/v1beta/openai/", False),
("o3-mini", "https://api.deepseek.com/v1", False),
def test_non_openai_models_not_auto_detected(self):
"""Non-OpenAI models should not be auto-detected as reasoning models"""
non_openai_models = [
"gemini-2.5-pro",
"gemini-2.5-flash",
"claude-sonnet-4-5-20250929",
"claude-opus-4-5-20251101",
"deepseek-r1",
]

for model_name, api_base, expected in test_cases:
with self.subTest(model=model_name, api=api_base):
result = is_reasoning_model(model_name, api_base)
self.assertEqual(
result,
expected,
f"Model '{model_name}' with API '{api_base}' should return {expected}",
for model_name in non_openai_models:
with self.subTest(model=model_name):
self.assertFalse(
is_reasoning_model(model_name),
f"Non-OpenAI model '{model_name}' should not be auto-detected",
)

def test_explicit_override_ignores_api_base(self):
"""Explicit config_flag overrides auto-detection regardless of model origin"""
# Even non-OpenAI models can be forced to reasoning mode
self.assertTrue(is_reasoning_model("gemini-2.5-flash", config_flag=True))
# Even OpenAI reasoning models can be forced to standard mode
self.assertFalse(is_reasoning_model("o3-mini", config_flag=False))


if __name__ == "__main__":
unittest.main()
84 changes: 84 additions & 0 deletions tests/test_reasoning_model_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Tests for reasoning model detection logic (is_reasoning_model function)"""

import unittest

from openevolve.llm.openai import OPENAI_REASONING_MODEL_PREFIXES, is_reasoning_model


class TestIsReasoningModel(unittest.TestCase):
    """Tests for the is_reasoning_model() detection helper."""

    def _check_auto_detect(self, model_names, expected):
        """Assert every name in *model_names* auto-detects to *expected*."""
        for name in model_names:
            with self.subTest(model=name):
                self.assertEqual(is_reasoning_model(name), expected)

    # Auto-detect (config_flag=None) -- OpenAI models
    def test_openai_o_series_auto_detected(self):
        self._check_auto_detect(
            ["o1", "o1-mini", "o3", "o3-mini", "o3-pro", "o4-mini"], True
        )

    def test_openai_gpt5_auto_detected(self):
        self._check_auto_detect(["gpt-5", "gpt-5-mini", "gpt-5-nano"], True)

    def test_openai_gpt_oss_auto_detected(self):
        self._check_auto_detect(["gpt-oss-120b", "gpt-oss-20b", "gpt-oss-30b"], True)

    def test_openai_non_reasoning_not_detected(self):
        self._check_auto_detect(
            ["gpt-4o", "gpt-4o-mini", "gpt-4", "gpt-3.5-turbo"], False
        )

    # Auto-detect -- Non-OpenAI models should NOT be auto-detected
    def test_gemini_not_auto_detected(self):
        self._check_auto_detect(
            ["gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite"], False
        )

    def test_claude_not_auto_detected(self):
        self._check_auto_detect(
            ["claude-sonnet-4-5-20250929", "claude-opus-4-5-20251101"], False
        )

    def test_deepseek_not_auto_detected(self):
        self.assertFalse(is_reasoning_model("deepseek-r1"))

    # Explicit config_flag=True -- forces reasoning model
    def test_explicit_true_overrides_auto_detect(self):
        for name in ("gemini-2.5-flash", "deepseek-r1", "any-unknown-model"):
            with self.subTest(model=name):
                self.assertTrue(is_reasoning_model(name, config_flag=True))

    # Explicit config_flag=False -- forces non-reasoning model
    def test_explicit_false_overrides_auto_detect(self):
        # Even OpenAI reasoning models can be forced to non-reasoning
        for name in ("o3-mini", "gpt-5"):
            with self.subTest(model=name):
                self.assertFalse(is_reasoning_model(name, config_flag=False))

    # Case insensitivity
    def test_case_insensitive(self):
        self._check_auto_detect(["O3-MINI", "GPT-5-MINI"], True)

    # Backward compatibility
    def test_none_config_flag_is_default(self):
        """None config_flag should behave exactly like the old hardcoded logic."""
        self.assertTrue(is_reasoning_model("o3-mini", config_flag=None))
        self.assertFalse(is_reasoning_model("gpt-4o", config_flag=None))


class TestReasoningModelPrefixes(unittest.TestCase):
    """Test that the prefix constant is properly defined"""

    def test_prefixes_is_tuple(self):
        self.assertIsInstance(OPENAI_REASONING_MODEL_PREFIXES, tuple)

    def test_prefixes_contains_o_series(self):
        # Check the actual tuple entries rather than a substring of the joined
        # string, which could match spuriously inside an unrelated entry.
        # At minimum, o1 and o3 families must be covered by some prefix.
        self.assertTrue(
            any(p.startswith("o1") for p in OPENAI_REASONING_MODEL_PREFIXES)
        )
        self.assertTrue(
            any(p.startswith("o3") for p in OPENAI_REASONING_MODEL_PREFIXES)
        )


# Allow running this test module directly: python tests/test_reasoning_model_detection.py
if __name__ == "__main__":
    unittest.main()
Loading