diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 9a2d8c8415c..089eeaea0d5 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -1914,9 +1914,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
                                    gdv_int32 text_len, const char* from_str,
                                    gdv_int32 from_str_len, const char* to_str,
                                    gdv_int32 to_str_len, gdv_int32* out_len) {
+  // Size the output buffer to the exact result, so large results are not capped
+  // by an arbitrary limit. When the replacement is no longer than the matched
+  // text, the result can only shrink or stay the same, so text_len is a safe
+  // bound and we can skip scanning. Otherwise count non-overlapping matches to
+  // get the exact expanded size.
+  gdv_int64 max_length;
+  if (to_str_len <= from_str_len) {
+    max_length = text_len;
+  } else {
+    gdv_int64 num_matches = 0;
+    if (from_str_len > 0 && from_str_len <= text_len) {
+      for (gdv_int32 i = 0; i <= text_len - from_str_len;) {
+        if (memcmp(text + i, from_str, from_str_len) == 0) {
+          num_matches++;
+          i += from_str_len;
+        } else {
+          i++;
+        }
+      }
+    }
+    max_length =
+        static_cast<gdv_int64>(text_len) + num_matches * (to_str_len - from_str_len);
+  }
+  // Gandiva variable-length output uses int32 offsets, so a single output string
+  // cannot exceed INT_MAX bytes. Report this explicitly instead of letting the
+  // cast below wrap silently.
+  if (max_length > INT_MAX) {
+    gdv_fn_context_set_error_msg(context,
+                                 "REPLACE: output string exceeds maximum size of 2GB");
+    *out_len = 0;
+    return "";
+  }
   return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str,
-                                             from_str_len, to_str, to_str_len, 65535,
-                                             out_len);
+                                             from_str_len, to_str, to_str_len,
+                                             static_cast<gdv_int32>(max_length), out_len);
 }
 
 // Returns the quoted string (Includes escape character for any single quotes)
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index a31683c65ac..9b7d9c6d9d1 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -1971,6 +1971,62 @@ TEST(TestStringOps, TestReplace) {
   EXPECT_EQ(std::string(out_str, out_len), "TestString");
   EXPECT_FALSE(ctx.has_error());
 
+  // Large output (>64 KB) must not overflow: buffer is sized to the exact result.
+  std::string large_in(35000, 'X');
+  std::string large_expected(70000, '\0');
+  for (int i = 0; i < 35000; ++i) {
+    large_expected[2 * i] = 'X';
+    large_expected[2 * i + 1] = 'Y';
+  }
+  out_str = replace_utf8_utf8_utf8(ctx_ptr, large_in.data(),
+                                   static_cast<int32_t>(large_in.size()), "X", 1, "XY", 2,
+                                   &out_len);
+  EXPECT_EQ(out_len, 70000);
+  EXPECT_EQ(std::string(out_str, out_len), large_expected);
+  EXPECT_FALSE(ctx.has_error());
+
+  // Large shrinking output ("XX" -> "X") on a >64 KB input.
+  std::string large_shrink_in(70000, 'X');
+  std::string large_shrink_expected(35000, 'X');
+  out_str = replace_utf8_utf8_utf8(ctx_ptr, large_shrink_in.data(),
+                                   static_cast<int32_t>(large_shrink_in.size()), "XX", 2,
+                                   "X", 1, &out_len);
+  EXPECT_EQ(out_len, 35000);
+  EXPECT_EQ(std::string(out_str, out_len), large_shrink_expected);
+  EXPECT_FALSE(ctx.has_error());
+
+  // Edge case: result size of exactly 0 (every byte of text is removed). Takes
+  // the no-scan shrink path (to_str_len <= from_str_len).
+  out_str = replace_utf8_utf8_utf8(ctx_ptr, "aaaa", 4, "a", 1, "", 0, &out_len);
+  EXPECT_EQ(out_len, 0);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  // Edge case: result size one past the INT_MAX boundary. 65536 single-char
+  // matches each expanding to 32768 bytes gives max_length = 65536 * 32768 =
+  // 2^31 = INT_MAX + 1, so it is reported cleanly (guard fires before any alloc).
+  std::string boundary_in(65536, 'a');
+  std::string boundary_to(32768, 'b');
+  replace_utf8_utf8_utf8(ctx_ptr, boundary_in.data(),
+                         static_cast<int32_t>(boundary_in.size()), "a", 1,
+                         boundary_to.data(), static_cast<int32_t>(boundary_to.size()),
+                         &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
+  EXPECT_EQ(out_len, 0);
+  ctx.Reset();
+
+  // Output that would exceed INT_MAX (2GB) is reported cleanly rather than
+  // silently wrapping the int32 size. 50000 matches each expanding to 50000
+  // bytes implies max_length = 2.5e9; the guard fires before any large alloc.
+  std::string huge_in(50000, 'X');
+  std::string huge_to(50000, 'Z');
+  replace_utf8_utf8_utf8(ctx_ptr, huge_in.data(), static_cast<int32_t>(huge_in.size()),
+                         "X", 1, huge_to.data(), static_cast<int32_t>(huge_to.size()),
+                         &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
+  EXPECT_EQ(out_len, 0);
+  ctx.Reset();
+
   replace_with_max_len_utf8_utf8_utf8(ctx_ptr, "Hell", 4, "ell", 3, "ollow", 5, 5,
                                       &out_len);
   EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer overflow for output string"));