Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion astrbot/core/provider/sources/openai_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,22 @@ def _extract_audio_part_info(self, part: dict) -> str | None:

async def _audio_ref_to_local_path(self, audio_ref: str) -> tuple[str, list[Path]]:
cleanup_paths: list[Path] = []
if audio_ref.startswith("data:"):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Please add corresponding unit tests to verify the handling of data: URI audio references, ensuring that decoding, temp file creation, and error handling work as expected.

References
  1. New functionality, such as handling attachments, should be accompanied by corresponding unit tests.

try:
header, base64_data = audio_ref.split(",", 1)
m = re.match(r"^data:audio/(\w+);base64$", header)
if not m:
raise ValueError("Invalid audio data URI header format")
suffix = f".{m.group(1)}"
audio_bytes = base64.b64decode(base64_data)
temp_dir = Path(get_astrbot_temp_path())
temp_dir.mkdir(parents=True, exist_ok=True)
target_path = temp_dir / f"provider_audio_{uuid.uuid4().hex}{suffix}"
target_path.write_bytes(audio_bytes)
cleanup_paths.append(target_path)
Comment on lines +360 to +372

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚨 suggestion (security): Consider guarding against very large data: URIs before base64 decoding

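Decoding an unbounded data: URI directly into memory can be abused for DoS via excessive memory/CPU. Consider enforcing a maximum allowed payload size before calling base64.b64decode and rejecting oversized inputs — i.e., check the length of the base64 payload (base64_data in the current diff, or m.group(2) in the suggested implementation below, whose regex captures the payload as a second group) — to make this path safer for untrusted input.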

Suggested implementation:

    async def _audio_ref_to_local_path(self, audio_ref: str) -> tuple[str, list[Path]]:
        cleanup_paths: list[Path] = []
        if audio_ref.startswith("data:"):
            m = re.match(r"^data:audio/(\w+);base64,(.+)$", audio_ref)
            if m:
                suffix = f".{m.group(1)}"
                base64_payload = m.group(2)

                # Guard against excessively large data: URIs before decoding
                # The base64 length is ~4/3 of the decoded size; this keeps decoded audio under a safe cap.
                max_base64_length = 8 * 1024 * 1024  # ~8MB base64, ~6MB decoded
                if len(base64_payload) > max_base64_length:
                    truncated = audio_ref[:256] if len(audio_ref) > 256 else audio_ref
                    logger.warning(
                        "音频 data: URI 过大,已拒绝。长度: %d,最大允许: %d,前缀: %s",
                        len(base64_payload),
                        max_base64_length,
                        truncated,
                    )
                    raise ValueError("data: URI payload too large")

                audio_bytes = base64.b64decode(base64_payload, validate=True)
                temp_dir = Path(get_astrbot_temp_path())
                temp_dir.mkdir(parents=True, exist_ok=True)
                target_path = temp_dir / f"provider_audio_{uuid.uuid4().hex}{suffix}"
                target_path.write_bytes(audio_bytes)
                cleanup_paths.append(target_path)
                return str(target_path), cleanup_paths
        if audio_ref.startswith("http"):
  1. Ensure that logger is available in this module (it appears to be used later in this function; if not already defined/imported, a module-level logger should be configured).
  2. If you have a shared configuration or constants module for size limits, consider replacing the hard-coded max_base64_length with a named constant imported from there to keep limits consistent across the codebase.

return str(target_path), cleanup_paths
Comment thread
Foolllll-J marked this conversation as resolved.
except Exception as e:
raise ValueError(f"Failed to decode audio data URI: {e}") from e
if audio_ref.startswith("http"):
suffix = Path(urlparse(audio_ref).path).suffix or ".wav"
temp_dir = Path(get_astrbot_temp_path())
Expand Down Expand Up @@ -384,7 +400,8 @@ async def _resolve_audio_part(self, audio_ref: str) -> dict | None:
audio_format = "wav"
audio_bytes = Path(audio_path).read_bytes()
except Exception as exc:
logger.warning("音频 %s 预处理失败,将忽略。错误: %s", audio_ref, exc)
truncated = audio_ref[:256] if len(audio_ref) > 256 else audio_ref
logger.warning("音频 %s 预处理失败,将忽略。错误: %s", truncated, exc)
return None
finally:
for cleanup_path in cleanup_paths:
Expand Down
Loading