From 6cd1f92c6fcbaa3b9f3a66d372bf6cfd53088582 Mon Sep 17 00:00:00 2001 From: Neelesh Salian Date: Sat, 23 May 2026 13:53:21 -0700 Subject: [PATCH] API: Harden variant binary parsing against malformed input Co-authored-by: Steve Loughran --- .../iceberg/variants/SerializedArray.java | 30 +- .../iceberg/variants/SerializedMetadata.java | 35 +- .../iceberg/variants/SerializedObject.java | 55 +++- .../iceberg/variants/SerializedPrimitive.java | 50 +++ .../variants/SerializedShortString.java | 4 + .../apache/iceberg/variants/VariantUtil.java | 29 ++ .../apache/iceberg/variants/VariantValue.java | 15 +- .../variants/TestMalformedVariant.java | 303 ++++++++++++++++++ .../iceberg/variants/TestSerializedArray.java | 32 +- .../variants/TestSerializedMetadata.java | 10 +- .../variants/TestSerializedObject.java | 4 +- 11 files changed, 521 insertions(+), 46 deletions(-) create mode 100644 api/src/test/java/org/apache/iceberg/variants/TestMalformedVariant.java diff --git a/api/src/main/java/org/apache/iceberg/variants/SerializedArray.java b/api/src/main/java/org/apache/iceberg/variants/SerializedArray.java index b390d96f31ed..6657b733c731 100644 --- a/api/src/main/java/org/apache/iceberg/variants/SerializedArray.java +++ b/api/src/main/java/org/apache/iceberg/variants/SerializedArray.java @@ -35,12 +35,16 @@ static SerializedArray from(VariantMetadata metadata, byte[] bytes) { } static SerializedArray from(VariantMetadata metadata, ByteBuffer value, int header) { + return from(metadata, value, header, 0); + } + + static SerializedArray from(VariantMetadata metadata, ByteBuffer value, int header, int depth) { Preconditions.checkArgument( value.order() == ByteOrder.LITTLE_ENDIAN, "Unsupported byte order: big endian"); BasicType basicType = VariantUtil.basicType(header); Preconditions.checkArgument( basicType == BasicType.ARRAY, "Invalid array, basic type: " + basicType); - return new SerializedArray(metadata, value, header); + return new SerializedArray(metadata, value, header, depth); } private final VariantMetadata metadata; @@ -48,15 +52,27 @@ static SerializedArray from(VariantMetadata metadata, ByteBuffer value, int head private final int offsetSize; private final int offsetListOffset; private final int dataOffset; + private final int depth; private final VariantValue[] array; - private SerializedArray(VariantMetadata metadata, ByteBuffer value, int header) { + private SerializedArray(VariantMetadata metadata, ByteBuffer value, int header, int depth) { this.metadata = metadata; this.value = value; + this.depth = depth; this.offsetSize = 1 + ((header & OFFSET_SIZE_MASK) >> OFFSET_SIZE_SHIFT); int numElementsSize = ((header & IS_LARGE) == IS_LARGE) ? 4 : 1; + Preconditions.checkArgument( + value.remaining() >= HEADER_SIZE + numElementsSize, + "Invalid variant array: buffer too small for element count field"); int numElements = VariantUtil.readLittleEndianUnsigned(value, HEADER_SIZE, numElementsSize); + Preconditions.checkArgument( + numElements >= 0, "Invalid variant array: negative element count %s", numElements); this.offsetListOffset = HEADER_SIZE + numElementsSize; + long offsetTableEnd = (long) offsetListOffset + ((long) numElements + 1L) * offsetSize; + Preconditions.checkArgument( + offsetTableEnd <= value.remaining(), + "Invalid variant array: element count %s exceeds buffer", + numElements); this.dataOffset = offsetListOffset + ((1 + numElements) * offsetSize); this.array = new VariantValue[numElements]; } @@ -75,8 +91,16 @@ public VariantValue get(int index) { int next = VariantUtil.readLittleEndianUnsigned( value, offsetListOffset + (offsetSize * (1 + index)), offsetSize); + long dataLen = value.remaining() - (long) dataOffset; + Preconditions.checkArgument( + offset >= 0 && next >= offset && next <= dataLen, + "Invalid variant array: offset range [%s, %s] out of data region [0, %s]", + offset, + next, + dataLen); array[index] = - VariantValue.from(metadata, VariantUtil.slice(value, dataOffset + offset, next - offset)); + VariantUtil.fromBuffer( + metadata, VariantUtil.slice(value, dataOffset + offset, next - offset), depth + 1); } return array[index]; } diff --git a/api/src/main/java/org/apache/iceberg/variants/SerializedMetadata.java b/api/src/main/java/org/apache/iceberg/variants/SerializedMetadata.java index 9113a8c1c969..749168f8286f 100644 --- a/api/src/main/java/org/apache/iceberg/variants/SerializedMetadata.java +++ b/api/src/main/java/org/apache/iceberg/variants/SerializedMetadata.java @@ -58,15 +58,32 @@ static SerializedMetadata from(ByteBuffer metadata) { private SerializedMetadata(ByteBuffer metadata, int header) { this.isSorted = (header & SORTED_STRINGS) == SORTED_STRINGS; this.offsetSize = 1 + ((header & OFFSET_SIZE_MASK) >> OFFSET_SIZE_SHIFT); + Preconditions.checkArgument( + metadata.remaining() >= HEADER_SIZE + offsetSize, + "Invalid variant metadata: buffer too small for dictionary size field"); int dictSize = VariantUtil.readLittleEndianUnsigned(metadata, HEADER_SIZE, offsetSize); - this.dict = new String[dictSize]; + Preconditions.checkArgument( + dictSize >= 0, "Invalid variant metadata: negative dictionary size %s", dictSize); this.offsetListOffset = HEADER_SIZE + offsetSize; + long offsetTableEnd = (long) offsetListOffset + ((long) dictSize + 1L) * offsetSize; + Preconditions.checkArgument( + offsetTableEnd <= metadata.remaining(), + "Invalid variant metadata: dictionary size %s exceeds buffer", + dictSize); + this.dict = new String[dictSize]; this.dataOffset = offsetListOffset + ((1 + dictSize) * offsetSize); - int endOffset = - dataOffset - + VariantUtil.readLittleEndianUnsigned( - metadata, offsetListOffset + (offsetSize * dictSize), offsetSize); - if (endOffset < metadata.limit()) { + int lastOffset = + VariantUtil.readLittleEndianUnsigned( + metadata, offsetListOffset + (offsetSize * dictSize), offsetSize); + Preconditions.checkArgument( + lastOffset >= 0, "Invalid variant metadata: negative end offset %s", lastOffset); + long endOffsetLong = (long) dataOffset + lastOffset; + Preconditions.checkArgument( + endOffsetLong <= metadata.remaining(), + "Invalid variant metadata: end offset %s exceeds buffer", + endOffsetLong); + int endOffset = (int) endOffsetLong; + if (endOffset < metadata.remaining()) { this.metadata = VariantUtil.slice(metadata, 0, endOffset); } else { this.metadata = metadata; @@ -111,6 +128,12 @@ public String get(int index) { int next = VariantUtil.readLittleEndianUnsigned( metadata, offsetListOffset + (offsetSize * (1 + index)), offsetSize); + Preconditions.checkArgument( + offset >= 0 && next >= offset && (long) dataOffset + next <= metadata.remaining(), + "Invalid variant metadata: dict entry %s offset range [%s, %s] invalid", + index, + offset, + next); dict[index] = VariantUtil.readString(metadata, dataOffset + offset, next - offset); } return dict[index]; diff --git a/api/src/main/java/org/apache/iceberg/variants/SerializedObject.java b/api/src/main/java/org/apache/iceberg/variants/SerializedObject.java index d74565891a8a..562b56b944ca 100644 --- a/api/src/main/java/org/apache/iceberg/variants/SerializedObject.java +++ b/api/src/main/java/org/apache/iceberg/variants/SerializedObject.java @@ -41,12 +41,16 @@ static SerializedObject from(VariantMetadata metadata, byte[] bytes) { } static SerializedObject from(VariantMetadata metadata, ByteBuffer value, int header) { + return from(metadata, value, header, 0); + } + + static SerializedObject from(VariantMetadata metadata, ByteBuffer value, int header, int depth) { Preconditions.checkArgument( value.order() == ByteOrder.LITTLE_ENDIAN, "Unsupported byte order: big endian"); BasicType basicType = VariantUtil.basicType(header); Preconditions.checkArgument( basicType == BasicType.OBJECT, "Invalid object, basic type: " + basicType); - return new SerializedObject(metadata, value, header); + return new SerializedObject(metadata, value, header, depth); } private final VariantMetadata metadata; @@ -59,18 +63,33 @@ static SerializedObject from(VariantMetadata metadata, ByteBuffer value, int hea private final int[] offsets; private final int[] lengths; private final int dataOffset; + private final int depth; private final VariantValue[] values; - private SerializedObject(VariantMetadata metadata, ByteBuffer value, int header) { + private SerializedObject(VariantMetadata metadata, ByteBuffer value, int header, int depth) { this.metadata = metadata; this.value = value; + this.depth = depth; this.offsetSize = 1 + ((header & OFFSET_SIZE_MASK) >> OFFSET_SIZE_SHIFT); this.fieldIdSize = 1 + ((header & FIELD_ID_SIZE_MASK) >> FIELD_ID_SIZE_SHIFT); int numElementsSize = ((header & IS_LARGE) == IS_LARGE) ? 4 : 1; + Preconditions.checkArgument( + value.remaining() >= HEADER_SIZE + numElementsSize, + "Invalid variant object: buffer too small for element count field"); int numElements = VariantUtil.readLittleEndianUnsigned(value, HEADER_SIZE, numElementsSize); + Preconditions.checkArgument( + numElements >= 0, "Invalid variant object: negative element count %s", numElements); this.fieldIdListOffset = HEADER_SIZE + numElementsSize; - this.fieldIds = new Integer[numElements]; + long dataStart = + (long) fieldIdListOffset + + (long) numElements * fieldIdSize + + ((long) numElements + 1L) * offsetSize; + Preconditions.checkArgument( + dataStart <= value.remaining(), + "Invalid variant object: element count %s exceeds buffer", + numElements); this.offsetListOffset = fieldIdListOffset + (numElements * fieldIdSize); + this.fieldIds = new Integer[numElements]; this.offsets = new int[numElements]; this.lengths = new int[numElements]; this.dataOffset = offsetListOffset + ((1 + numElements) * offsetSize); @@ -95,11 +114,26 @@ private void initOffsetsAndLengths(int numElements) { int dataLength = VariantUtil.readLittleEndianUnsigned( value, offsetListOffset + (numElements * offsetSize), offsetSize); + long dataLen = value.remaining() - (long) dataOffset; + Preconditions.checkArgument( + dataLength >= 0 && dataLength <= dataLen, + "Invalid variant object: data length %s out of data region [0, %s]", + dataLength, + dataLen); + for (int index = 0; index < numElements; index += 1) { + Preconditions.checkArgument( + offsets[index] >= 0 && offsets[index] <= dataLength, + "Invalid variant object: offset %s out of declared data length %s", + offsets[index], + dataLength); + } offsetToLength.put(dataLength, 0); // populate lengths list by sorting offsets List sortedOffsets = offsetToLength.keySet().stream().sorted().collect(Collectors.toList()); + Preconditions.checkArgument( + sortedOffsets.size() == numElements + 1, "Invalid variant object: duplicate field offsets"); for (int index = 0; index < numElements; index += 1) { int offset = sortedOffsets.get(index); int length = sortedOffsets.get(index + 1) - offset; @@ -162,9 +196,16 @@ public String next() { private int id(int index) { if (null == fieldIds[index]) { - fieldIds[index] = + int dictSize = metadata.dictionarySize(); + int id = VariantUtil.readLittleEndianUnsigned( value, fieldIdListOffset + (index * fieldIdSize), fieldIdSize); + Preconditions.checkArgument( + id >= 0 && id < dictSize, + "Invalid variant object: field id %s out of range [0, %s)", + id, + dictSize); + fieldIds[index] = id; } return fieldIds[index]; @@ -181,8 +222,10 @@ public VariantValue get(String name) { if (null == values[index]) { values[index] = - VariantValue.from( - metadata, VariantUtil.slice(value, dataOffset + offsets[index], lengths[index])); + VariantUtil.fromBuffer( + metadata, + VariantUtil.slice(value, dataOffset + offsets[index], lengths[index]), + depth + 1); } return values[index]; diff --git a/api/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java b/api/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java index 840a210ac0b7..3725b05e9c75 100644 --- a/api/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java +++ b/api/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java @@ -50,6 +50,56 @@ static SerializedPrimitive from(ByteBuffer value, int header) { private SerializedPrimitive(ByteBuffer value, int header) { this.value = value; this.type = PhysicalType.from(header >> PRIMITIVE_TYPE_SHIFT); + long requiredBytes = PRIMITIVE_OFFSET + payloadSize(type, value); + Preconditions.checkArgument( + requiredBytes <= value.remaining(), + "Invalid variant primitive: %s payload extends past buffer", + type); + } + + private static long payloadSize(PhysicalType type, ByteBuffer value) { + switch (type) { + case NULL: + case BOOLEAN_TRUE: + case BOOLEAN_FALSE: + return 0; + case INT8: + return 1; + case INT16: + return 2; + case INT32: + case DATE: + case FLOAT: + return 4; + case INT64: + case TIMESTAMPTZ: + case TIMESTAMPNTZ: + case TIME: + case TIMESTAMPTZ_NANOS: + case TIMESTAMPNTZ_NANOS: + case DOUBLE: + return 8; + case DECIMAL4: + return 5; + case DECIMAL8: + return 9; + case DECIMAL16: + return 17; + case UUID: + return 16; + case BINARY: + case STRING: + Preconditions.checkArgument( + PRIMITIVE_OFFSET + 4 <= value.remaining(), + "Invalid variant primitive: %s size field extends past buffer", + type); + int size = VariantUtil.readLittleEndianInt32(value, PRIMITIVE_OFFSET); + Preconditions.checkArgument( + size >= 0, "Invalid variant primitive: negative %s size %s", type, size); + return 4L + size; + } + + throw new UnsupportedOperationException("Unsupported primitive type: " + type); } private Object read() { diff --git a/api/src/main/java/org/apache/iceberg/variants/SerializedShortString.java b/api/src/main/java/org/apache/iceberg/variants/SerializedShortString.java index 90fb54f3b13e..15124b363d9a 100644 --- a/api/src/main/java/org/apache/iceberg/variants/SerializedShortString.java +++ b/api/src/main/java/org/apache/iceberg/variants/SerializedShortString.java @@ -47,6 +47,10 @@ static SerializedShortString from(ByteBuffer value, int header) { private SerializedShortString(ByteBuffer value, int header) { this.value = value; this.length = ((header & LENGTH_MASK) >> LENGTH_SHIFT); + Preconditions.checkArgument( + HEADER_SIZE + length <= value.remaining(), + "Invalid variant short string: length %s exceeds buffer", + length); } @Override diff --git a/api/src/main/java/org/apache/iceberg/variants/VariantUtil.java b/api/src/main/java/org/apache/iceberg/variants/VariantUtil.java index d4335df8b567..ae2cb2627b70 100644 --- a/api/src/main/java/org/apache/iceberg/variants/VariantUtil.java +++ b/api/src/main/java/org/apache/iceberg/variants/VariantUtil.java @@ -31,8 +31,37 @@ class VariantUtil { private static final int BASIC_TYPE_OBJECT = 2; private static final int BASIC_TYPE_ARRAY = 3; + /** + * Maximum permitted nesting depth of a Variant value. The top-level value is depth 0, so a + * Variant may contain up to {@code MAX_VARIANT_DEPTH} nested levels. + */ + static final int MAX_VARIANT_DEPTH = 500; + private VariantUtil() {} + static VariantValue fromBuffer(VariantMetadata metadata, ByteBuffer value, int depth) { + Preconditions.checkArgument( + depth <= MAX_VARIANT_DEPTH, + "Invalid variant: nesting depth %s exceeds maximum %s", + depth, + MAX_VARIANT_DEPTH); + Preconditions.checkArgument(value.remaining() >= 1, "Invalid variant: empty value buffer"); + int header = readByte(value, 0); + BasicType basicType = basicType(header); + switch (basicType) { + case PRIMITIVE: + return SerializedPrimitive.from(value, header); + case SHORT_STRING: + return SerializedShortString.from(value, header); + case OBJECT: + return SerializedObject.from(metadata, value, header, depth); + case ARRAY: + return SerializedArray.from(metadata, value, header, depth); + } + + throw new UnsupportedOperationException("Unsupported basic type: " + basicType); + } + /** A hacky absolute put for ByteBuffer */ static int writeBufferAbsolute(ByteBuffer buffer, int offset, ByteBuffer toCopy) { int originalPosition = buffer.position(); diff --git a/api/src/main/java/org/apache/iceberg/variants/VariantValue.java b/api/src/main/java/org/apache/iceberg/variants/VariantValue.java index 1cda7b2d3ca2..ea27857e2908 100644 --- a/api/src/main/java/org/apache/iceberg/variants/VariantValue.java +++ b/api/src/main/java/org/apache/iceberg/variants/VariantValue.java @@ -61,19 +61,6 @@ default VariantArray asArray() { } static VariantValue from(VariantMetadata metadata, ByteBuffer value) { - int header = VariantUtil.readByte(value, 0); - BasicType basicType = VariantUtil.basicType(header); - switch (basicType) { - case PRIMITIVE: - return SerializedPrimitive.from(value, header); - case SHORT_STRING: - return SerializedShortString.from(value, header); - case OBJECT: - return SerializedObject.from(metadata, value, header); - case ARRAY: - return SerializedArray.from(metadata, value, header); - } - - throw new UnsupportedOperationException("Unsupported basic type: " + basicType); + return VariantUtil.fromBuffer(metadata, value, 0); } } diff --git a/api/src/test/java/org/apache/iceberg/variants/TestMalformedVariant.java b/api/src/test/java/org/apache/iceberg/variants/TestMalformedVariant.java new file mode 100644 index 000000000000..7c79bd22a170 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/variants/TestMalformedVariant.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.variants; + +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import org.junit.jupiter.api.Test; + +public class TestMalformedVariant { + + private static final ByteBuffer EMPTY_METADATA = + ByteBuffer.wrap(new byte[] {0x01, 0x00, 0x00}).order(ByteOrder.LITTLE_ENDIAN); + + @Test + public void testOversizedMetadataDictSize() { + byte[] bytes = new byte[] {0x01, (byte) 0xFF}; + + assertThatThrownBy(() -> SerializedMetadata.from(bytes)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("dictionary size"); + } + + @Test + public void testOversizedArrayNumElements() { + ByteBuffer value = + ByteBuffer.wrap(new byte[] {(byte) 0b10011, 0x00, 0x00, 0x00, 0x10, 0x00}) + .order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("element count"); + } + + @Test + public void testOversizedObjectNumElements() { + ByteBuffer value = + ByteBuffer.wrap(new byte[] {(byte) 0b1000010, 0x00, 0x00, 0x00, 0x10, 0x00}) + .order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("element count"); + } + + @Test + public void testFieldIdOutOfRange() { + // field id check is lazy, so descend via get() to trigger it + ByteBuffer value = + ByteBuffer.wrap(new byte[] {0x02, 0x01, 0x05, 0x00, 0x01, 0x00}) + .order(ByteOrder.LITTLE_ENDIAN); + + VariantValue top = VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value); + + assertThatThrownBy(() -> top.asObject().get("anything")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("field id"); + } + + @Test + public void testOutOfRangeChildOffsetInArray() { + ByteBuffer value = + ByteBuffer.wrap(new byte[] {0b0011, 0x01, (byte) 0xFF, 0x01, 0x00}) + .order(ByteOrder.LITTLE_ENDIAN); + + VariantValue top = VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value); + + assertThatThrownBy(() -> top.asArray().get(0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("out of data region"); + } + + @Test + public void testMalformedChildOffsetCaughtOnDescent() { + // outer array constructs cleanly. Inner element is rejected only when accessed + ByteBuffer value = + ByteBuffer.wrap( + new byte[] {0b0011, 0x01, 0x00, 0x06, (byte) 0b10011, 0x00, 0x00, 0x00, 0x10, 0x00}) + .order(ByteOrder.LITTLE_ENDIAN); + + VariantValue top = VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value); + + assertThatThrownBy(() -> top.asArray().get(0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("element count"); + } + + @Test + public void testNegativeMetadataEndOffset() { + byte[] metadata = { + (byte) 0b11000001, 0x00, 0x00, 0x00, 0x00, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF + }; + + assertThatThrownBy(() -> SerializedMetadata.from(metadata)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("negative end offset"); + } + + @Test + public void testNegativeDictOffsetInGet() { + byte[] metadata = { + (byte) 0b11000001, + 0x01, + 0x00, + 0x00, + 0x00, + (byte) 0xFF, + (byte) 0xFF, + (byte) 0xFF, + (byte) 0xFF, + 0x00, + 0x00, + 0x00, + 0x00 + }; + + SerializedMetadata parsed = SerializedMetadata.from(metadata); + + assertThatThrownBy(() -> parsed.get(0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("dict entry"); + } + + @Test + public void testOversizedPrimitiveStringSize() { + ByteBuffer value = + ByteBuffer.wrap(new byte[] {(byte) 0b1000000, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, 0x7F}) + .order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("payload"); + } + + @Test + public void testOffsetExceedsDeclaredDataLengthInObject() { + byte[] metadata = {0x01, 0x01, 0x00, 0x01, 'a'}; + ByteBuffer value = + ByteBuffer.wrap(new byte[] {0x02, 0x01, 0x00, 0x32, 0x00, 0x00}) + .order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(metadata), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("declared data length"); + } + + @Test + public void testOffsetArithmeticOverflowInObject() { + // numElements * offsetSize would overflow signed int + ByteBuffer value = + ByteBuffer.wrap(new byte[] {0x4E, 0x33, 0x33, 0x33, 0x33, (byte) 0xFF, (byte) 0xFF, 0x3F}) + .order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("element count"); + } + + @Test + public void testNonMonotonicDictOffset() { + byte[] metadata = {0x01, 0x02, 0x00, 0x05, 0x03, 'A', 'B', 'C', 'D', 'E'}; + + SerializedMetadata parsed = SerializedMetadata.from(metadata); + + assertThatThrownBy(() -> parsed.get(1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("dict entry"); + } + + @Test + public void testDuplicateFieldOffsetsInObject() { + byte[] metadata = {0x01, 0x01, 0x00, 0x01, 'a'}; + ByteBuffer value = + ByteBuffer.wrap(new byte[] {0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00}) + .order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(metadata), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("duplicate field offsets"); + } + + @Test + public void testEmptyChildValueBufferInArray() { + // offset[0] == offset[1] produces a zero-length child slice + ByteBuffer value = + ByteBuffer.wrap(new byte[] {0b0011, 0x01, 0x00, 0x00}).order(ByteOrder.LITTLE_ENDIAN); + + VariantValue top = VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value); + + assertThatThrownBy(() -> top.asArray().get(0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("empty value buffer"); + } + + @Test + public void testShortStringLengthExceedsBuffer() { + // header declares length=7, but only the header byte is present + ByteBuffer value = ByteBuffer.wrap(new byte[] {0x1D}).order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("length"); + } + + @Test + public void testNegativeObjectNumElements() { + // largeSize object header, four 0xFF bytes read as int32 -1 + ByteBuffer value = + ByteBuffer.wrap( + new byte[] {(byte) 0b1000010, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}) + .order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("negative element count"); + } + + @Test + public void testNegativeMetadataDictSize() { + // offsetSize=4, four 0xFF bytes read as int32 -1 + byte[] metadata = {(byte) 0b11000001, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + + assertThatThrownBy(() -> SerializedMetadata.from(metadata)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("negative dictionary size"); + } + + @Test + public void testOversizedMetadataEndOffset() { + // dictSize=0, lastOffset=0xFF claims 255 bytes of dict data past the 3-byte buffer + byte[] metadata = {0x01, 0x00, (byte) 0xFF}; + + assertThatThrownBy(() -> SerializedMetadata.from(metadata)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("end offset"); + } + + @Test + public void testTruncatedPrimitiveSizeField() { + // string header but buffer is too short to contain the 4-byte size field + ByteBuffer value = + ByteBuffer.wrap(new byte[] {(byte) 0b1000000, 0x00}).order(ByteOrder.LITTLE_ENDIAN); + + assertThatThrownBy(() -> VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("size field"); + } + + @Test + public void testOverdeepNestingRejected() { + int chainDepth = VariantUtil.MAX_VARIANT_DEPTH + 1; + byte[] bytes = buildNestedArrayChain(chainDepth); + ByteBuffer value = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN); + + VariantValue top = VariantValue.from(SerializedMetadata.from(EMPTY_METADATA), value); + VariantArray cursor = top.asArray(); + for (int i = 0; i < VariantUtil.MAX_VARIANT_DEPTH; i += 1) { + cursor = cursor.get(0).asArray(); + } + + VariantArray finalCursor = cursor; + assertThatThrownBy(() -> finalCursor.get(0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("nesting depth"); + } + + private static byte[] buildNestedArrayChain(int arrayLevels) { + final byte arrayHeader = 0b0111; // small array, offsetSize=2 + final int perLevel = 6; + final int leafSize = 1; + int total = perLevel * arrayLevels + leafSize; + byte[] buf = new byte[total]; + for (int i = 0; i < arrayLevels; i += 1) { + int pos = i * perLevel; + int innerLen = total - pos - perLevel; + buf[pos] = arrayHeader; + buf[pos + 1] = 0x01; + buf[pos + 2] = 0x00; + buf[pos + 3] = 0x00; + buf[pos + 4] = (byte) (innerLen & 0xFF); + buf[pos + 5] = (byte) ((innerLen >> 8) & 0xFF); + } + buf[total - 1] = 0x00; + return buf; + } +} diff --git a/api/src/test/java/org/apache/iceberg/variants/TestSerializedArray.java b/api/src/test/java/org/apache/iceberg/variants/TestSerializedArray.java index 5d53c82a317f..67e923c11b78 100644 --- a/api/src/test/java/org/apache/iceberg/variants/TestSerializedArray.java +++ b/api/src/test/java/org/apache/iceberg/variants/TestSerializedArray.java @@ -22,6 +22,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Random; import org.apache.iceberg.util.RandomUtil; import org.junit.jupiter.api.Test; @@ -55,7 +56,7 @@ public class TestSerializedArray { @Test public void testEmptyArray() { - SerializedArray array = SerializedArray.from(EMPTY_METADATA, new byte[] {0b0011, 0x00}); + SerializedArray array = SerializedArray.from(EMPTY_METADATA, new byte[] {0b0011, 0x00, 0x00}); assertThat(array.type()).isEqualTo(PhysicalType.ARRAY); assertThat(array.numElements()).isEqualTo(0); @@ -64,7 +65,7 @@ public void testEmptyArray() { @Test public void testEmptyLargeArray() { SerializedArray array = - SerializedArray.from(EMPTY_METADATA, new byte[] {0b10011, 0x00, 0x00, 0x00, 0x00}); + SerializedArray.from(EMPTY_METADATA, new byte[] {0b10011, 0x00, 0x00, 0x00, 0x00, 0x00}); assertThat(array.type()).isEqualTo(PhysicalType.ARRAY); assertThat(array.numElements()).isEqualTo(0); @@ -179,12 +180,25 @@ public void testMultiByteOffsets(int multiByteOffset) { @Test public void testLargeArraySize() { - SerializedArray array = - SerializedArray.from( - EMPTY_METADATA, new byte[] {0b10011, (byte) 0xFF, (byte) 0x01, 0x00, 0x00}); - + // largeSize array, offsetSize=2 (offsetSize=1 cannot represent offsets > 255), 511 NULL + // elements + int numElements = 511; + ByteBuffer buf = + ByteBuffer.allocate(1 + 4 + (numElements + 1) * 2 + numElements) + .order(ByteOrder.LITTLE_ENDIAN); + buf.put((byte) 0b10111); + buf.putInt(numElements); + for (int i = 0; i <= numElements; i += 1) { + buf.putShort((short) i); + } + for (int i = 0; i < numElements; i += 1) { + buf.put((byte) 0x00); // NULL primitive header + } + buf.flip(); + + SerializedArray array = SerializedArray.from(EMPTY_METADATA, buf, buf.get(0)); assertThat(array.type()).isEqualTo(PhysicalType.ARRAY); - assertThat(array.numElements()).isEqualTo(511); + assertThat(array.numElements()).isEqualTo(numElements); } @Test @@ -194,7 +208,7 @@ public void testNegativeArraySize() { SerializedArray.from( EMPTY_METADATA, new byte[] {0b10011, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF})) - .isInstanceOf(NegativeArraySizeException.class) - .hasMessage("-1"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("negative element count"); } } diff --git a/api/src/test/java/org/apache/iceberg/variants/TestSerializedMetadata.java b/api/src/test/java/org/apache/iceberg/variants/TestSerializedMetadata.java index 70b6a28b946d..3fdd28571741 100644 --- a/api/src/test/java/org/apache/iceberg/variants/TestSerializedMetadata.java +++ b/api/src/test/java/org/apache/iceberg/variants/TestSerializedMetadata.java @@ -235,20 +235,18 @@ public void testInvalidMetadataVersion() { } @Test - @SuppressWarnings("checkstyle:AssertThatThrownByWithMessageCheck") public void testMissingLength() { - // no check on the underlying error msg as it might be missing based on the JDK version assertThatThrownBy(() -> SerializedMetadata.from(new byte[] {0x01})) - .isInstanceOf(IndexOutOfBoundsException.class); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("buffer too small"); } @Test - @SuppressWarnings("checkstyle:AssertThatThrownByWithMessageCheck") public void testLengthTooShort() { // missing the 4th length byte - // no check on the underlying error msg as it might be missing based on the JDK version assertThatThrownBy( () -> SerializedMetadata.from(new byte[] {(byte) 0b11010001, 0x00, 0x00, 0x00})) - .isInstanceOf(IndexOutOfBoundsException.class); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("buffer too small"); } } diff --git a/api/src/test/java/org/apache/iceberg/variants/TestSerializedObject.java b/api/src/test/java/org/apache/iceberg/variants/TestSerializedObject.java index 07f7c9a2c5da..0cbd7ceaeede 100644 --- a/api/src/test/java/org/apache/iceberg/variants/TestSerializedObject.java +++ b/api/src/test/java/org/apache/iceberg/variants/TestSerializedObject.java @@ -68,7 +68,7 @@ public class TestSerializedObject { @Test public void testEmptyObject() { - SerializedObject object = SerializedObject.from(EMPTY_METADATA, new byte[] {0b10, 0x00}); + SerializedObject object = SerializedObject.from(EMPTY_METADATA, new byte[] {0b10, 0x00, 0x00}); assertThat(object.type()).isEqualTo(PhysicalType.OBJECT); assertThat(object.numFields()).isEqualTo(0); @@ -77,7 +77,7 @@ public void testEmptyObject() { @Test public void testEmptyLargeObject() { SerializedObject object = - SerializedObject.from(EMPTY_METADATA, new byte[] {0b1000010, 0x00, 0x00, 0x00, 0x00}); + SerializedObject.from(EMPTY_METADATA, new byte[] {0b1000010, 0x00, 0x00, 0x00, 0x00, 0x00}); assertThat(object.type()).isEqualTo(PhysicalType.OBJECT); assertThat(object.numFields()).isEqualTo(0);