Skip to content

Commit 4ea27dc

Browse files
author
Jordan Epstein
committed
GH-1179: Fix VectorAppender data size computation for variable-width vectors with non-zero start offsets
VectorAppender computed the delta vector's data size as its last offset value, which is only correct when the offset buffer starts at zero. Vectors imported through the C data interface from sliced arrays can have a non-zero first offset; appending them copied the unreferenced data buffer prefix into the target, inflating it on every append until allocation eventually failed with OversizedAllocationException. Compute the data size as the distance between the first and last offsets, copy from the first offset, and rebase appended offsets accordingly. Fixes #1179.
1 parent 485f813 commit 4ea27dc

2 files changed

Lines changed: 100 additions & 11 deletions

File tree

vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,15 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) {
125125
targetVector
126126
.getOffsetBuffer()
127127
.getInt((long) targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH);
128+
// The delta vector's offset buffer need not start at zero (e.g. a vector imported through
129+
// the C data interface from a sliced array), so the amount of data to append is the
130+
// distance between its first and last offsets, not the last offset itself.
131+
int deltaDataStart = deltaVector.getOffsetBuffer().getInt(0);
128132
int deltaDataSize =
129133
deltaVector
130-
.getOffsetBuffer()
131-
.getInt((long) deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH);
134+
.getOffsetBuffer()
135+
.getInt((long) deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH)
136+
- deltaDataStart;
132137
int newValueCapacity = targetDataSize + deltaDataSize;
133138

134139
// make sure there is enough capacity
@@ -149,7 +154,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) {
149154

150155
// append data buffer
151156
MemoryUtil.copyMemory(
152-
deltaVector.getDataBuffer().memoryAddress(),
157+
deltaVector.getDataBuffer().memoryAddress() + deltaDataStart,
153158
targetVector.getDataBuffer().memoryAddress() + targetDataSize,
154159
deltaDataSize);
155160

@@ -160,7 +165,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) {
160165
+ (targetVector.getValueCount() + 1) * BaseVariableWidthVector.OFFSET_WIDTH,
161166
deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH);
162167

163-
// increase each offset from the second buffer
168+
// rebase each appended offset to the target's data, accounting for the delta's start offset
164169
for (int i = 0; i < deltaVector.getValueCount(); i++) {
165170
int oldOffset =
166171
targetVector
@@ -172,7 +177,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) {
172177
.getOffsetBuffer()
173178
.setInt(
174179
(long) (targetVector.getValueCount() + 1 + i) * BaseVariableWidthVector.OFFSET_WIDTH,
175-
oldOffset + targetDataSize);
180+
oldOffset - deltaDataStart + targetDataSize);
176181
}
177182
((BaseVariableWidthVector) targetVector).setLastSet(newValueCount - 1);
178183
targetVector.setValueCount(newValueCount);
@@ -196,11 +201,15 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) {
196201
.getOffsetBuffer()
197202
.getLong(
198203
(long) targetVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH);
204+
// see the corresponding comment in visit(BaseVariableWidthVector, Void): the delta's
205+
// offset buffer need not start at zero
206+
long deltaDataStart = deltaVector.getOffsetBuffer().getLong(0);
199207
long deltaDataSize =
200208
deltaVector
201-
.getOffsetBuffer()
202-
.getLong(
203-
(long) deltaVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH);
209+
.getOffsetBuffer()
210+
.getLong(
211+
(long) deltaVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH)
212+
- deltaDataStart;
204213
long newValueCapacity = targetDataSize + deltaDataSize;
205214

206215
// make sure there is enough capacity
@@ -221,7 +230,7 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) {
221230

222231
// append data buffer
223232
MemoryUtil.copyMemory(
224-
deltaVector.getDataBuffer().memoryAddress(),
233+
deltaVector.getDataBuffer().memoryAddress() + deltaDataStart,
225234
targetVector.getDataBuffer().memoryAddress() + targetDataSize,
226235
deltaDataSize);
227236

@@ -232,7 +241,7 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) {
232241
+ (targetVector.getValueCount() + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH,
233242
deltaVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH);
234243

235-
// increase each offset from the second buffer
244+
// rebase each appended offset to the target's data, accounting for the delta's start offset
236245
for (int i = 0; i < deltaVector.getValueCount(); i++) {
237246
long oldOffset =
238247
targetVector
@@ -245,7 +254,7 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) {
245254
.setLong(
246255
(long) (targetVector.getValueCount() + 1 + i)
247256
* BaseLargeVariableWidthVector.OFFSET_WIDTH,
248-
oldOffset + targetDataSize);
257+
oldOffset - deltaDataStart + targetDataSize);
249258
}
250259
((BaseLargeVariableWidthVector) targetVector).setLastSet(newValueCount - 1);
251260
targetVector.setValueCount(newValueCount);

vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,13 @@
2626
import java.util.List;
2727
import java.util.stream.IntStream;
2828
import java.util.stream.Stream;
29+
import org.apache.arrow.memory.ArrowBuf;
2930
import org.apache.arrow.memory.BufferAllocator;
3031
import org.apache.arrow.memory.RootAllocator;
3132
import org.apache.arrow.memory.util.CommonUtil;
33+
import org.apache.arrow.vector.BaseLargeVariableWidthVector;
3234
import org.apache.arrow.vector.BaseValueVector;
35+
import org.apache.arrow.vector.BaseVariableWidthVector;
3336
import org.apache.arrow.vector.BaseVariableWidthViewVector;
3437
import org.apache.arrow.vector.BigIntVector;
3538
import org.apache.arrow.vector.BitVector;
@@ -53,6 +56,7 @@
5356
import org.apache.arrow.vector.holders.NullableBigIntHolder;
5457
import org.apache.arrow.vector.holders.NullableFloat4Holder;
5558
import org.apache.arrow.vector.holders.NullableIntHolder;
59+
import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
5660
import org.apache.arrow.vector.testing.ValueVectorDataPopulator;
5761
import org.apache.arrow.vector.types.Types;
5862
import org.apache.arrow.vector.types.pojo.ArrowType;
@@ -178,6 +182,82 @@ public void testAppendVariableWidthVector() {
178182
}
179183
}
180184

185+
@Test
186+
public void testAppendVariableWidthVectorWithNonZeroStartOffset() {
187+
try (VarCharVector target = new VarCharVector("", allocator);
188+
VarCharVector delta = new VarCharVector("", allocator)) {
189+
190+
target.allocateNew(64, 4);
191+
ValueVectorDataPopulator.setVector(target, "a0", "a1");
192+
193+
// Build a delta vector whose offset buffer does not start at zero, as produced e.g. by
194+
// importing a sliced array through the C data interface. The values are "BBBB" and
195+
// "CCCC"; the data buffer additionally holds 4 bytes of unreferenced prefix ("AAAA").
196+
try (ArrowBuf validity = allocator.buffer(1);
197+
ArrowBuf offsets = allocator.buffer(12);
198+
ArrowBuf data = allocator.buffer(12)) {
199+
validity.setByte(0, 0b11);
200+
offsets.setInt(0, 4);
201+
offsets.setInt(4, 8);
202+
offsets.setInt(8, 12);
203+
data.setBytes(0, "AAAABBBBCCCC".getBytes(StandardCharsets.UTF_8));
204+
delta.loadFieldBuffers(new ArrowFieldNode(2, 0), Arrays.asList(validity, offsets, data));
205+
}
206+
207+
VectorAppender appender = new VectorAppender(target);
208+
delta.accept(appender, null);
209+
210+
// the unreferenced prefix must not be appended
211+
assertEquals(
212+
4 + 8,
213+
target
214+
.getOffsetBuffer()
215+
.getInt((long) target.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH));
216+
217+
try (VarCharVector expected = new VarCharVector("expected", allocator)) {
218+
expected.allocateNew();
219+
ValueVectorDataPopulator.setVector(expected, "a0", "a1", "BBBB", "CCCC");
220+
assertVectorsEqual(expected, target);
221+
}
222+
}
223+
}
224+
225+
@Test
226+
public void testAppendLargeVariableWidthVectorWithNonZeroStartOffset() {
227+
try (LargeVarCharVector target = new LargeVarCharVector("", allocator);
228+
LargeVarCharVector delta = new LargeVarCharVector("", allocator)) {
229+
230+
target.allocateNew(64, 4);
231+
ValueVectorDataPopulator.setVector(target, "a0", "a1");
232+
233+
try (ArrowBuf validity = allocator.buffer(1);
234+
ArrowBuf offsets = allocator.buffer(24);
235+
ArrowBuf data = allocator.buffer(12)) {
236+
validity.setByte(0, 0b11);
237+
offsets.setLong(0, 4);
238+
offsets.setLong(8, 8);
239+
offsets.setLong(16, 12);
240+
data.setBytes(0, "AAAABBBBCCCC".getBytes(StandardCharsets.UTF_8));
241+
delta.loadFieldBuffers(new ArrowFieldNode(2, 0), Arrays.asList(validity, offsets, data));
242+
}
243+
244+
VectorAppender appender = new VectorAppender(target);
245+
delta.accept(appender, null);
246+
247+
assertEquals(
248+
4 + 8,
249+
target
250+
.getOffsetBuffer()
251+
.getLong((long) target.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH));
252+
253+
try (LargeVarCharVector expected = new LargeVarCharVector("expected", allocator)) {
254+
expected.allocateNew();
255+
ValueVectorDataPopulator.setVector(expected, "a0", "a1", "BBBB", "CCCC");
256+
assertVectorsEqual(expected, target);
257+
}
258+
}
259+
}
260+
181261
@Test
182262
public void testAppendVariableWidthViewVector() {
183263
final int length1 = 10;

0 commit comments

Comments
 (0)