Skip to content

Commit 636e75d

Browse files
committed
GH-3561 Harden variant decoding
- reject oversized metadata/value declarations - reject oversize dictSize in objects - range checking Only low cost checks are made. There's also a depth check consistent with the json parser; it's arguable as to whether that is needed. It will defend against StackOverflowExceptions by anything trying to treewalk, but should that code be the place to do the checks? The new test suite TestHardenedReader can be configured to actually emit the malformed files, to see how applications deal with them.
1 parent c7e7acb commit 636e75d

11 files changed

Lines changed: 862 additions & 133 deletions

File tree

parquet-variant/pom.xml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,17 @@
6969
<version>${slf4j.version}</version>
7070
<scope>test</scope>
7171
</dependency>
72+
<dependency>
73+
<groupId>org.apache.parquet</groupId>
74+
<artifactId>parquet-hadoop</artifactId>
75+
<version>${project.version}</version>
76+
<scope>test</scope>
77+
</dependency>
78+
<dependency>
79+
<groupId>org.apache.hadoop</groupId>
80+
<artifactId>hadoop-client</artifactId>
81+
<scope>test</scope>
82+
</dependency>
7283
</dependencies>
7384

7485
<build>

parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.math.BigDecimal;
2222
import java.nio.ByteBuffer;
2323
import java.util.UUID;
24+
import org.apache.parquet.Preconditions;
2425

2526
/**
2627
* This Variant class holds the Variant-encoded value and metadata binary values.
@@ -36,6 +37,12 @@ public final class Variant {
3637
*/
3738
final ByteBuffer metadata;
3839

40+
/** Number of entries in the metadata dictionary, cached from construction time. */
41+
private final int dictSize;
42+
43+
/** Nesting depth of this Variant relative to the top-level value (0 = top-level). */
44+
private final int depth;
45+
3946
/**
4047
* The threshold to switch from linear search to binary search when looking up a field by key in
4148
* an object. This is a performance optimization to avoid the overhead of binary search for a
@@ -60,13 +67,27 @@ public Variant(ByteBuffer value, ByteBuffer metadata) {
6067
// is not important.
6168
this.value = value.asReadOnlyBuffer();
6269
this.metadata = metadata.asReadOnlyBuffer();
70+
this.depth = 0;
71+
this.dictSize = VariantUtil.validateMetadata(this.metadata);
72+
VariantUtil.validateValueShallow(this.value, this.dictSize);
73+
}
6374

64-
// There is currently only one allowed version.
65-
if ((metadata.get(metadata.position()) & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) {
66-
throw new UnsupportedOperationException(String.format(
67-
"Unsupported variant metadata version: %d",
68-
metadata.get(metadata.position()) & VariantUtil.VERSION_MASK));
69-
}
75+
/**
76+
* Package-private constructor for a child Variant produced by slicing an already-validated
77+
* parent. The metadata buffer and {@code dictSize} are inherited from the parent (no
78+
* re-validation); the value slot is bounds-checked shallowly, and the depth counter is
79+
* propagated so that nesting depth is bounded.
80+
*/
81+
Variant(ByteBuffer value, ByteBuffer metadata, int dictSize, int depth) {
82+
Preconditions.checkArgument(
83+
depth <= VariantUtil.MAX_VARIANT_DEPTH,
84+
"variant nesting depth exceeds maximum %s",
85+
VariantUtil.MAX_VARIANT_DEPTH);
86+
this.value = value.asReadOnlyBuffer();
87+
this.metadata = metadata;
88+
this.dictSize = dictSize;
89+
this.depth = depth;
90+
VariantUtil.validateValueShallow(this.value, dictSize);
7091
}
7192

7293
public ByteBuffer getValueBuffer() {
@@ -219,7 +240,9 @@ public Variant getFieldByKey(String key) {
219240
info.offsetSize,
220241
value.position() + info.idStartOffset,
221242
value.position() + info.offsetStartOffset,
222-
value.position() + info.dataStartOffset);
243+
value.position() + info.dataStartOffset,
244+
dictSize,
245+
depth + 1);
223246
if (field.key.equals(key)) {
224247
return field.value;
225248
}
@@ -240,7 +263,9 @@ public Variant getFieldByKey(String key) {
240263
info.offsetSize,
241264
value.position() + info.idStartOffset,
242265
value.position() + info.offsetStartOffset,
243-
value.position() + info.dataStartOffset);
266+
value.position() + info.dataStartOffset,
267+
dictSize,
268+
depth + 1);
244269
int cmp = field.key.compareTo(key);
245270
if (cmp < 0) {
246271
low = mid + 1;
@@ -286,7 +311,9 @@ public ObjectField getFieldAtIndex(int idx) {
286311
info.offsetSize,
287312
value.position() + info.idStartOffset,
288313
value.position() + info.offsetStartOffset,
289-
value.position() + info.dataStartOffset);
314+
value.position() + info.dataStartOffset,
315+
dictSize,
316+
depth + 1);
290317
return field;
291318
}
292319

@@ -298,12 +325,14 @@ static ObjectField getFieldAtIndex(
298325
int offsetSize,
299326
int idStart,
300327
int offsetStart,
301-
int dataStart) {
328+
int dataStart,
329+
int dictSize,
330+
int childDepth) {
302331
// idStart, offsetStart, and dataStart are absolute positions in the `value` buffer.
303332
int id = VariantUtil.readUnsigned(value, idStart + idSize * index, idSize);
304333
int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
305334
String key = VariantUtil.getMetadataKey(metadata, id);
306-
Variant v = new Variant(VariantUtil.slice(value, dataStart + offset), metadata);
335+
Variant v = new Variant(VariantUtil.slice(value, dataStart + offset), metadata, dictSize, childDepth);
307336
return new ObjectField(key, v);
308337
}
309338

@@ -334,13 +363,22 @@ public Variant getElementAtIndex(int index) {
334363
metadata,
335364
info.offsetSize,
336365
value.position() + info.offsetStartOffset,
337-
value.position() + info.dataStartOffset);
366+
value.position() + info.dataStartOffset,
367+
dictSize,
368+
depth + 1);
338369
}
339370

340371
private static Variant getElementAtIndex(
341-
int index, ByteBuffer value, ByteBuffer metadata, int offsetSize, int offsetStart, int dataStart) {
372+
int index,
373+
ByteBuffer value,
374+
ByteBuffer metadata,
375+
int offsetSize,
376+
int offsetStart,
377+
int dataStart,
378+
int dictSize,
379+
int childDepth) {
342380
// offsetStart and dataStart are absolute positions in the `value` buffer.
343381
int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
344-
return new Variant(VariantUtil.slice(value, dataStart + offset), metadata);
382+
return new Variant(VariantUtil.slice(value, dataStart + offset), metadata, dictSize, childDepth);
345383
}
346384
}

parquet-variant/src/main/java/org/apache/parquet/variant/VariantJsonParser.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
*/
1717
package org.apache.parquet.variant;
1818

19+
import static org.apache.parquet.variant.VariantUtil.MAX_VARIANT_DEPTH;
20+
1921
import com.fasterxml.jackson.core.JsonFactory;
2022
import com.fasterxml.jackson.core.JsonParseException;
2123
import com.fasterxml.jackson.core.JsonParser;
@@ -37,7 +39,7 @@ public final class VariantJsonParser {
3739

3840
private static final JsonFactory JSON_FACTORY = JsonFactory.builder()
3941
.streamReadConstraints(StreamReadConstraints.builder()
40-
.maxNestingDepth(500)
42+
.maxNestingDepth(MAX_VARIANT_DEPTH)
4143
.maxStringLength(10_000_000)
4244
.maxDocumentLength(50_000_000L)
4345
.build())

parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.nio.ByteOrder;
2323
import java.util.Arrays;
2424
import java.util.HashMap;
25+
import org.apache.parquet.Preconditions;
2526

2627
/**
2728
* This class defines constants related to the Variant format and provides functions for
@@ -188,6 +189,12 @@ class VariantUtil {
188189
// The size (in bytes) of a UUID.
189190
static final int UUID_SIZE = 16;
190191

192+
/**
193+
* Maximum permitted nesting depth of a Variant value.
194+
* same limit as in VariantJsonParser.
195+
*/
196+
static final int MAX_VARIANT_DEPTH = 500;
197+
191198
// header bytes
192199
static final byte HEADER_NULL = primitiveHeader(NULL);
193200
static final byte HEADER_LONG_STRING = primitiveHeader(LONG_STR);
@@ -851,6 +858,160 @@ static HashMap<String, Integer> getMetadataMap(ByteBuffer metadata) {
851858
return result;
852859
}
853860

861+
/**
862+
* Bounds-checks the metadata buffer: header version, dictionary offset table and string data
863+
* region all fit within the buffer extent. It does not perform any deep checks into
864+
* the metadata itself.
865+
*
866+
* @param metadata the variant metadata buffer
867+
* @return the dictionary size
868+
* @throws IllegalArgumentException if the metadata buffer is not well-formed
869+
*/
870+
static int validateMetadata(ByteBuffer metadata) {
871+
int pos = metadata.position();
872+
Preconditions.checkArgument(pos >= 0 && pos < metadata.limit(), "variant metadata is empty");
873+
int header = metadata.get(pos) & 0xFF;
874+
Preconditions.checkArgument(
875+
(header & VERSION_MASK) == VERSION, "Unsupported variant metadata version: %s", header & VERSION_MASK);
876+
int offsetSize = ((header >> 6) & 0x3) + 1;
877+
long remaining = (long) metadata.limit() - pos;
878+
long offsetListStart = 1L + offsetSize;
879+
Preconditions.checkArgument(offsetListStart <= remaining, "variant metadata truncated");
880+
int dictSize = readUnsigned(metadata, pos + 1, offsetSize);
881+
long offsetBytes = (long) (dictSize + 1) * offsetSize;
882+
long dataStart = offsetListStart + offsetBytes;
883+
Preconditions.checkArgument(
884+
dataStart <= remaining, "variant metadata dictionary table extends past buffer: dictSize=%s", dictSize);
885+
return dictSize;
886+
}
887+
888+
/**
889+
* Bounds-checks a single Variant value node against its buffer slot. Performs no recursion
890+
* into nested children: child nodes are checked on demand when callers descend into them.
891+
*
892+
* <p>Cost: O(1) for primitives and short strings, O(numElements) for objects and arrays.
893+
* Validation of nested structures is deferred so that opening a large well-formed Variant
894+
* is not penalised by sub-trees the caller never inspects.
895+
*
896+
* @param value the variant value buffer (position/limit define the extent of this node's slot)
897+
* @param dictSize the metadata dictionary size, used to bound object field ids
898+
* @throws IllegalArgumentException if the value header or container table does not fit within
899+
* the buffer slot, or if any object field id is out of range
900+
*/
901+
static void validateValueShallow(ByteBuffer value, int dictSize) {
902+
int s = value.position();
903+
Preconditions.checkArgument(s >= 0 && s < value.limit(), "variant value is empty");
904+
long slot = (long) value.limit() - s;
905+
int header = value.get(s) & 0xFF;
906+
int basicType = header & BASIC_TYPE_MASK;
907+
int typeInfo = (header >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK;
908+
switch (basicType) {
909+
case SHORT_STR:
910+
Preconditions.checkArgument(1L + typeInfo <= slot, "variant short string extends past buffer");
911+
return;
912+
case OBJECT:
913+
validateContainerShallow(value, s, slot, dictSize, true, typeInfo);
914+
return;
915+
case ARRAY:
916+
validateContainerShallow(value, s, slot, dictSize, false, typeInfo);
917+
return;
918+
default:
919+
validatePrimitiveShallow(value, s, slot, typeInfo);
920+
}
921+
}
922+
923+
private static void validateContainerShallow(
924+
ByteBuffer value, int s, long slot, int dictSize, boolean isObject, int typeInfo) {
925+
boolean largeSize;
926+
int idSize;
927+
if (isObject) {
928+
largeSize = ((typeInfo >> 4) & 0x1) != 0;
929+
idSize = ((typeInfo >> 2) & 0x3) + 1;
930+
} else {
931+
largeSize = ((typeInfo >> 2) & 0x1) != 0;
932+
idSize = 0;
933+
}
934+
int offsetSize = (typeInfo & 0x3) + 1;
935+
int sizeBytes = largeSize ? U32_SIZE : 1;
936+
Preconditions.checkArgument(1L + sizeBytes <= slot, "variant container header truncated");
937+
int numElements = readUnsigned(value, s + 1, sizeBytes);
938+
long idStart = 1L + sizeBytes;
939+
long idBytes = isObject ? (long) numElements * idSize : 0L;
940+
long offsetStart = idStart + idBytes;
941+
long offsetBytes = (long) (numElements + 1) * offsetSize;
942+
long dataStart = offsetStart + offsetBytes;
943+
Preconditions.checkArgument(
944+
dataStart <= slot, "variant container offset table extends past buffer: numElements=%s", numElements);
945+
long dataLen = slot - dataStart;
946+
if (isObject) {
947+
for (int i = 0; i < numElements; i++) {
948+
int id = readUnsigned(value, s + (int) idStart + i * idSize, idSize);
949+
Preconditions.checkArgument(
950+
id < dictSize, "variant object key id %s out of range (dictSize=%s)", id, dictSize);
951+
}
952+
}
953+
// Each child offset must lie within the data region. Children may overlap or leave gaps;
954+
// the trailing terminator offset is range-checked for the same reason.
955+
for (int i = 0; i <= numElements; i++) {
956+
// O(elements)
957+
int off = readUnsigned(value, s + (int) offsetStart + i * offsetSize, offsetSize);
958+
Preconditions.checkArgument(
959+
off <= dataLen, "variant child offset out of range: %s (data length %s)", off, dataLen);
960+
}
961+
}
962+
963+
private static void validatePrimitiveShallow(ByteBuffer value, int s, long slot, int typeInfo) {
964+
long size;
965+
switch (typeInfo) {
966+
case NULL:
967+
case TRUE:
968+
case FALSE:
969+
size = 1;
970+
break;
971+
case INT8:
972+
size = 2;
973+
break;
974+
case INT16:
975+
size = 3;
976+
break;
977+
case INT32:
978+
case DATE:
979+
case FLOAT:
980+
size = 5;
981+
break;
982+
case INT64:
983+
case DOUBLE:
984+
case TIMESTAMP_TZ:
985+
case TIMESTAMP_NTZ:
986+
case TIME:
987+
case TIMESTAMP_NANOS_TZ:
988+
case TIMESTAMP_NANOS_NTZ:
989+
size = 9;
990+
break;
991+
case DECIMAL4:
992+
size = 6;
993+
break;
994+
case DECIMAL8:
995+
size = 10;
996+
break;
997+
case DECIMAL16:
998+
size = 18;
999+
break;
1000+
case BINARY:
1001+
case LONG_STR: {
1002+
Preconditions.checkArgument(1L + U32_SIZE <= slot, "variant string/binary length field truncated");
1003+
size = 1L + U32_SIZE + readUnsigned(value, s + 1, U32_SIZE);
1004+
break;
1005+
}
1006+
case UUID:
1007+
size = 1L + UUID_SIZE;
1008+
break;
1009+
default:
1010+
throw new IllegalArgumentException(String.format("Unknown primitive type in variant: %d", typeInfo));
1011+
}
1012+
Preconditions.checkArgument(size <= slot, "variant value extends past buffer");
1013+
}
1014+
8541015
/**
8551016
* Computes the actual size (in bytes) of the Variant value.
8561017
* @param value The Variant value binary

0 commit comments

Comments
 (0)