Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1914,9 +1914,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
gdv_int32 text_len, const char* from_str,
gdv_int32 from_str_len, const char* to_str,
gdv_int32 to_str_len, gdv_int32* out_len) {
// Size the output buffer to the exact result, so large results are not capped
// by an arbitrary limit. When the replacement is no longer than the matched
// text, the result can only shrink or stay the same, so text_len is a safe
// bound and we can skip scanning. Otherwise count non-overlapping matches to
// get the exact expanded size.
gdv_int64 max_length;
if (to_str_len <= from_str_len) {
max_length = text_len;
} else {
gdv_int64 num_matches = 0;
if (from_str_len > 0 && from_str_len <= text_len) {
for (gdv_int32 i = 0; i <= text_len - from_str_len;) {
if (memcmp(text + i, from_str, from_str_len) == 0) {
num_matches++;
i += from_str_len;
} else {
i++;
}
}
}
max_length =
static_cast<gdv_int64>(text_len) + num_matches * (to_str_len - from_str_len);
}
// Gandiva variable-length output uses int32 offsets, so a single output string
// cannot exceed INT_MAX bytes. Report this explicitly instead of letting the
// cast below wrap silently.
if (max_length > INT_MAX) {
gdv_fn_context_set_error_msg(context,
"REPLACE: output string exceeds maximum size of 2GB");
*out_len = 0;
return "";
}
return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str,
from_str_len, to_str, to_str_len, 65535,
out_len);
from_str_len, to_str, to_str_len,
static_cast<gdv_int32>(max_length), out_len);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we even need this argument in the function signature?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, thats how the function know how much memory to allocate for the output string.

}

// Returns the quoted string (Includes escape character for any single quotes)
Expand Down
56 changes: 56 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1971,6 +1971,62 @@ TEST(TestStringOps, TestReplace) {
EXPECT_EQ(std::string(out_str, out_len), "TestString");
EXPECT_FALSE(ctx.has_error());

// Large output (>64 KB) must not overflow: buffer is sized to the exact result.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd feel more comfortable with few specific edge cases but I would not block the PR. Examples:

  • INT_MAX resulting size
  • 0 resulting size

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested one byte past the limit (INT_MAX + 1) rather than exactly INT_MAX, because a result of exactly INT_MAX is allowed and would force a real ~2 GB arena allocation — not something to do in a unit test. The boundary test confirms the guard triggers at precisely the right point.

std::string large_in(35000, 'X');
std::string large_expected(70000, '\0');
for (int i = 0; i < 35000; ++i) {
large_expected[2 * i] = 'X';
large_expected[2 * i + 1] = 'Y';
}
out_str = replace_utf8_utf8_utf8(ctx_ptr, large_in.data(),
static_cast<int32_t>(large_in.size()), "X", 1, "XY", 2,
&out_len);
EXPECT_EQ(out_len, 70000);
EXPECT_EQ(std::string(out_str, out_len), large_expected);
EXPECT_FALSE(ctx.has_error());

// Large shrinking output ("XX" -> "X") on a >64 KB input.
std::string large_shrink_in(70000, 'X');
std::string large_shrink_expected(35000, 'X');
out_str = replace_utf8_utf8_utf8(ctx_ptr, large_shrink_in.data(),
static_cast<int32_t>(large_shrink_in.size()), "XX", 2,
"X", 1, &out_len);
EXPECT_EQ(out_len, 35000);
EXPECT_EQ(std::string(out_str, out_len), large_shrink_expected);
EXPECT_FALSE(ctx.has_error());

// Edge case: result size of exactly 0 (every byte of text is removed). Takes
// the no-scan shrink path (to_str_len <= from_str_len).
out_str = replace_utf8_utf8_utf8(ctx_ptr, "aaaa", 4, "a", 1, "", 0, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_EQ(std::string(out_str, out_len), "");
EXPECT_FALSE(ctx.has_error());

// Edge case: result size one past the INT_MAX boundary. 65536 single-char
// matches each expanding to 32768 bytes gives max_length = 65536 * 32768 =
// 2^31 = INT_MAX + 1, so it is reported cleanly (guard fires before any alloc).
std::string boundary_in(65536, 'a');
std::string boundary_to(32768, 'b');
replace_utf8_utf8_utf8(ctx_ptr, boundary_in.data(),
static_cast<int32_t>(boundary_in.size()), "a", 1,
boundary_to.data(), static_cast<int32_t>(boundary_to.size()),
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
EXPECT_EQ(out_len, 0);
ctx.Reset();

// Output that would exceed INT_MAX (2GB) is reported cleanly rather than
// silently wrapping the int32 size. 50000 matches each expanding to 50000
// bytes implies max_length = 2.5e9; the guard fires before any large alloc.
std::string huge_in(50000, 'X');
std::string huge_to(50000, 'Z');
replace_utf8_utf8_utf8(ctx_ptr, huge_in.data(), static_cast<int32_t>(huge_in.size()),
"X", 1, huge_to.data(), static_cast<int32_t>(huge_to.size()),
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
EXPECT_EQ(out_len, 0);
ctx.Reset();

replace_with_max_len_utf8_utf8_utf8(ctx_ptr, "Hell", 4, "ell", 3, "ollow", 5, 5,
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer overflow for output string"));
Expand Down
Loading