|
22 | 22 | import java.nio.ByteOrder; |
23 | 23 | import java.util.Arrays; |
24 | 24 | import java.util.HashMap; |
| 25 | +import org.apache.parquet.Preconditions; |
25 | 26 |
|
26 | 27 | /** |
27 | 28 | * This class defines constants related to the Variant format and provides functions for |
@@ -188,6 +189,12 @@ class VariantUtil { |
188 | 189 | // The size (in bytes) of a UUID. |
189 | 190 | static final int UUID_SIZE = 16; |
190 | 191 |
|
| 192 | + /** |
| 193 | + * Maximum permitted nesting depth of a Variant value. |
| 194 | + * This is the same limit as the one used in VariantJsonParser.
| 195 | + */ |
| 196 | + static final int MAX_VARIANT_DEPTH = 500; |
| 197 | + |
191 | 198 | // header bytes |
192 | 199 | static final byte HEADER_NULL = primitiveHeader(NULL); |
193 | 200 | static final byte HEADER_LONG_STRING = primitiveHeader(LONG_STR); |
@@ -851,6 +858,160 @@ static HashMap<String, Integer> getMetadataMap(ByteBuffer metadata) { |
851 | 858 | return result; |
852 | 859 | } |
853 | 860 |
|
| 861 | + /** |
| 862 | + * Bounds-checks the metadata buffer: header version, dictionary offset table and string data |
| 863 | + * region all fit within the buffer extent. It does not perform any deep checks into |
| 864 | + * the metadata itself. |
| 865 | + * |
| 866 | + * @param metadata the variant metadata buffer |
| 867 | + * @return the dictionary size |
| 868 | + * @throws IllegalArgumentException if the metadata buffer is not well-formed |
| 869 | + */ |
| 870 | + static int validateMetadata(ByteBuffer metadata) { |
| 871 | + int pos = metadata.position(); |
| 872 | + Preconditions.checkArgument(pos >= 0 && pos < metadata.limit(), "variant metadata is empty"); |
| 873 | + int header = metadata.get(pos) & 0xFF; |
| 874 | + Preconditions.checkArgument( |
| 875 | + (header & VERSION_MASK) == VERSION, "Unsupported variant metadata version: %s", header & VERSION_MASK); |
| 876 | + int offsetSize = ((header >> 6) & 0x3) + 1; |
| 877 | + long remaining = (long) metadata.limit() - pos; |
| 878 | + long offsetListStart = 1L + offsetSize; |
| 879 | + Preconditions.checkArgument(offsetListStart <= remaining, "variant metadata truncated"); |
| 880 | + int dictSize = readUnsigned(metadata, pos + 1, offsetSize); |
| 881 | + long offsetBytes = (long) (dictSize + 1) * offsetSize; |
| 882 | + long dataStart = offsetListStart + offsetBytes; |
| 883 | + Preconditions.checkArgument( |
| 884 | + dataStart <= remaining, "variant metadata dictionary table extends past buffer: dictSize=%s", dictSize); |
| 885 | + return dictSize; |
| 886 | + } |
| 887 | + |
| 888 | + /** |
| 889 | + * Bounds-checks a single Variant value node against its buffer slot. Performs no recursion |
| 890 | + * into nested children: child nodes are checked on demand when callers descend into them. |
| 891 | + * |
| 892 | + * <p>Cost: O(1) for primitives and short strings, O(numElements) for objects and arrays. |
| 893 | + * Validation of nested structures is deferred so that opening a large well-formed Variant |
| 894 | + * is not penalised by sub-trees the caller never inspects. |
| 895 | + * |
| 896 | + * @param value the variant value buffer (position/limit define the extent of this node's slot) |
| 897 | + * @param dictSize the metadata dictionary size, used to bound object field ids |
| 898 | + * @throws IllegalArgumentException if the value header or container table does not fit within |
| 899 | + * the buffer slot, or if any object field id is out of range |
| 900 | + */ |
| 901 | + static void validateValueShallow(ByteBuffer value, int dictSize) { |
| 902 | + int s = value.position(); |
| 903 | + Preconditions.checkArgument(s >= 0 && s < value.limit(), "variant value is empty"); |
| 904 | + long slot = (long) value.limit() - s; |
| 905 | + int header = value.get(s) & 0xFF; |
| 906 | + int basicType = header & BASIC_TYPE_MASK; |
| 907 | + int typeInfo = (header >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; |
| 908 | + switch (basicType) { |
| 909 | + case SHORT_STR: |
| 910 | + Preconditions.checkArgument(1L + typeInfo <= slot, "variant short string extends past buffer"); |
| 911 | + return; |
| 912 | + case OBJECT: |
| 913 | + validateContainerShallow(value, s, slot, dictSize, true, typeInfo); |
| 914 | + return; |
| 915 | + case ARRAY: |
| 916 | + validateContainerShallow(value, s, slot, dictSize, false, typeInfo); |
| 917 | + return; |
| 918 | + default: |
| 919 | + validatePrimitiveShallow(value, s, slot, typeInfo); |
| 920 | + } |
| 921 | + } |
| 922 | + |
| 923 | + private static void validateContainerShallow( |
| 924 | + ByteBuffer value, int s, long slot, int dictSize, boolean isObject, int typeInfo) { |
| 925 | + boolean largeSize; |
| 926 | + int idSize; |
| 927 | + if (isObject) { |
| 928 | + largeSize = ((typeInfo >> 4) & 0x1) != 0; |
| 929 | + idSize = ((typeInfo >> 2) & 0x3) + 1; |
| 930 | + } else { |
| 931 | + largeSize = ((typeInfo >> 2) & 0x1) != 0; |
| 932 | + idSize = 0; |
| 933 | + } |
| 934 | + int offsetSize = (typeInfo & 0x3) + 1; |
| 935 | + int sizeBytes = largeSize ? U32_SIZE : 1; |
| 936 | + Preconditions.checkArgument(1L + sizeBytes <= slot, "variant container header truncated"); |
| 937 | + int numElements = readUnsigned(value, s + 1, sizeBytes); |
| 938 | + long idStart = 1L + sizeBytes; |
| 939 | + long idBytes = isObject ? (long) numElements * idSize : 0L; |
| 940 | + long offsetStart = idStart + idBytes; |
| 941 | + long offsetBytes = (long) (numElements + 1) * offsetSize; |
| 942 | + long dataStart = offsetStart + offsetBytes; |
| 943 | + Preconditions.checkArgument( |
| 944 | + dataStart <= slot, "variant container offset table extends past buffer: numElements=%s", numElements); |
| 945 | + long dataLen = slot - dataStart; |
| 946 | + if (isObject) { |
| 947 | + for (int i = 0; i < numElements; i++) { |
| 948 | + int id = readUnsigned(value, s + (int) idStart + i * idSize, idSize); |
| 949 | + Preconditions.checkArgument( |
| 950 | + id < dictSize, "variant object key id %s out of range (dictSize=%s)", id, dictSize); |
| 951 | + } |
| 952 | + } |
| 953 | + // Each child offset must lie within the data region. Children may overlap or leave gaps; |
| 954 | + // the trailing terminator offset is range-checked for the same reason. |
| 955 | + for (int i = 0; i <= numElements; i++) { |
| 956 | + // O(elements) |
| 957 | + int off = readUnsigned(value, s + (int) offsetStart + i * offsetSize, offsetSize); |
| 958 | + Preconditions.checkArgument( |
| 959 | + off <= dataLen, "variant child offset out of range: %s (data length %s)", off, dataLen); |
| 960 | + } |
| 961 | + } |
| 962 | + |
| 963 | + private static void validatePrimitiveShallow(ByteBuffer value, int s, long slot, int typeInfo) { |
| 964 | + long size; |
| 965 | + switch (typeInfo) { |
| 966 | + case NULL: |
| 967 | + case TRUE: |
| 968 | + case FALSE: |
| 969 | + size = 1; |
| 970 | + break; |
| 971 | + case INT8: |
| 972 | + size = 2; |
| 973 | + break; |
| 974 | + case INT16: |
| 975 | + size = 3; |
| 976 | + break; |
| 977 | + case INT32: |
| 978 | + case DATE: |
| 979 | + case FLOAT: |
| 980 | + size = 5; |
| 981 | + break; |
| 982 | + case INT64: |
| 983 | + case DOUBLE: |
| 984 | + case TIMESTAMP_TZ: |
| 985 | + case TIMESTAMP_NTZ: |
| 986 | + case TIME: |
| 987 | + case TIMESTAMP_NANOS_TZ: |
| 988 | + case TIMESTAMP_NANOS_NTZ: |
| 989 | + size = 9; |
| 990 | + break; |
| 991 | + case DECIMAL4: |
| 992 | + size = 6; |
| 993 | + break; |
| 994 | + case DECIMAL8: |
| 995 | + size = 10; |
| 996 | + break; |
| 997 | + case DECIMAL16: |
| 998 | + size = 18; |
| 999 | + break; |
| 1000 | + case BINARY: |
| 1001 | + case LONG_STR: { |
| 1002 | + Preconditions.checkArgument(1L + U32_SIZE <= slot, "variant string/binary length field truncated"); |
| 1003 | + size = 1L + U32_SIZE + readUnsigned(value, s + 1, U32_SIZE); |
| 1004 | + break; |
| 1005 | + } |
| 1006 | + case UUID: |
| 1007 | + size = 1L + UUID_SIZE; |
| 1008 | + break; |
| 1009 | + default: |
| 1010 | + throw new IllegalArgumentException(String.format("Unknown primitive type in variant: %d", typeInfo)); |
| 1011 | + } |
| 1012 | + Preconditions.checkArgument(size <= slot, "variant value extends past buffer"); |
| 1013 | + } |
| 1014 | + |
854 | 1015 | /** |
855 | 1016 | * Computes the actual size (in bytes) of the Variant value. |
856 | 1017 | * @param value The Variant value binary |
|
0 commit comments