-
Notifications
You must be signed in to change notification settings - Fork 1.5k
GH-3585: Fix DataPageHeaderV2.num_nulls=-1 when column statistics are disabled #3586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,6 +45,10 @@ class ColumnValueCollector { | |
| private SizeStatistics.Builder sizeStatisticsBuilder; | ||
| private GeospatialStatistics.Builder geospatialStatisticsBuilder; | ||
|
|
||
| // track the required `num_nulls` field in DataPageHeaderV2 | ||
| // https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift | ||
| private int nullCount; | ||
|
|
||
| ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) { | ||
| this.path = path; | ||
| this.statisticsEnabled = props.getStatisticsEnabled(path); | ||
|
|
@@ -54,6 +58,7 @@ class ColumnValueCollector { | |
| } | ||
|
|
||
| void resetPageStatistics() { | ||
| this.nullCount = 0; | ||
| this.statistics = statisticsEnabled | ||
| ? Statistics.createStats(path.getPrimitiveType()) | ||
| : Statistics.noopStats(path.getPrimitiveType()); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it possible to change
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's possible, but putting it outside Putting counter of |
||
|
|
@@ -68,6 +73,7 @@ void resetPageStatistics() { | |
| } | ||
|
|
||
| void writeNull(int repetitionLevel, int definitionLevel) { | ||
| ++nullCount; | ||
| statistics.incrementNumNulls(); | ||
| sizeStatisticsBuilder.add(repetitionLevel, definitionLevel); | ||
| } | ||
|
|
@@ -198,6 +204,14 @@ void finalizeColumnChunk() { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns the number of null values written in the current page. | ||
| * This counter is to supply the required field `num_nulls` in DataPageHeaderV2. | ||
| */ | ||
| int getNullCount() { | ||
| return nullCount; | ||
| } | ||
|
|
||
| Statistics<?> getStatistics() { | ||
| return statistics; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -379,6 +379,7 @@ void writePage() { | |
| writePage( | ||
| pageRowCount, | ||
| valueCount, | ||
| collector.getNullCount(), | ||
| collector.getStatistics(), | ||
| collector.getSizeStatistics(), | ||
| collector.getGeospatialStatistics(), | ||
|
|
@@ -403,6 +404,7 @@ void writePage() { | |
| abstract void writePage( | ||
| int rowCount, | ||
| int valueCount, | ||
| int nullCount, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This adds a V2-only value to the shared writePage signature, so ColumnWriterV1 has to accept an unused parameter. Consider keeping nullCount on the V2 path only, either through a V2-specific helper or a protected getter from the base class. |
||
| Statistics<?> statistics, | ||
| SizeStatistics sizeStatistics, | ||
| GeospatialStatistics geospatialStatistics, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,9 +58,14 @@ | |
| import org.apache.parquet.ParquetReadOptions; | ||
| import org.apache.parquet.bytes.HeapByteBufferAllocator; | ||
| import org.apache.parquet.bytes.TrackingByteBufferAllocator; | ||
| import org.apache.parquet.column.ColumnDescriptor; | ||
| import org.apache.parquet.column.Encoding; | ||
| import org.apache.parquet.column.ParquetProperties; | ||
| import org.apache.parquet.column.ParquetProperties.WriterVersion; | ||
| import org.apache.parquet.column.page.DataPage; | ||
| import org.apache.parquet.column.page.DataPageV2; | ||
| import org.apache.parquet.column.page.PageReadStore; | ||
| import org.apache.parquet.column.page.PageReader; | ||
| import org.apache.parquet.column.values.bloomfilter.BloomFilter; | ||
| import org.apache.parquet.crypto.AesCipher; | ||
| import org.apache.parquet.crypto.ColumnEncryptionProperties; | ||
|
|
@@ -858,4 +863,68 @@ public void testNoFlushAfterException() throws Exception { | |
| FileSystem fs = file.getFileSystem(conf); | ||
| assertTrue(!fs.exists(file) || fs.getFileStatus(file).getLen() == 0); | ||
| } | ||
|
|
||
| @Test | ||
| public void testV2PageNullCountWithStatisticsDisabled() throws Exception { | ||
| // Regression test: when using PARQUET_2_0 with statistics disabled on a nullable column, | ||
| // DataPageHeaderV2.num_nulls must still contain the correct null count (not -1). | ||
| MessageType schema = Types.buildMessage() | ||
| .required(INT32) | ||
| .named("id") | ||
| .optional(BINARY) | ||
| .as(stringType()) | ||
| .named("value") | ||
| .named("test_schema"); | ||
|
|
||
| File file = temp.newFile(); | ||
| temp.delete(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should delete only the placeholder file, not the TemporaryFolder root. file.delete() is enough to free the output path and avoids changing the test fixture lifecycle. |
||
| Path path = new Path(file.getAbsolutePath()); | ||
|
|
||
| int totalRecords = 10; | ||
| int expectedNulls = 4; // records where i % 3 == 0: i=0,3,6,9 | ||
|
|
||
| // Write with PARQUET_2_0 and statistics disabled on the nullable "value" column | ||
| try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path) | ||
| .withType(schema) | ||
| .withWriterVersion(PARQUET_2_0) | ||
| .withStatisticsEnabled("value", false) | ||
| .withPageSize(1024 * 1024) // large page to keep all records in one page | ||
| .build()) { | ||
| SimpleGroupFactory factory = new SimpleGroupFactory(schema); | ||
| for (int i = 0; i < totalRecords; i++) { | ||
| Group group = factory.newGroup().append("id", i); | ||
| if (i % 3 != 0) { | ||
| group.append("value", "hello-" + i); | ||
| } | ||
| writer.write(group); | ||
| } | ||
| } | ||
|
|
||
| // Read back the page-level metadata and verify num_nulls | ||
| try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { | ||
| MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema(); | ||
|
|
||
| // Find the "value" column descriptor | ||
| ColumnDescriptor valueColumn = fileSchema.getColumns().stream() | ||
| .filter(c -> c.getPath()[0].equals("value")) | ||
| .findFirst() | ||
| .orElseThrow(() -> new AssertionError("Column 'value' not found")); | ||
|
|
||
| PageReadStore rowGroup = reader.readNextRowGroup(); | ||
| PageReader pageReader = rowGroup.getPageReader(valueColumn); | ||
| DataPage page = pageReader.readPage(); | ||
|
|
||
| // Verify it's a V2 page (because we used PARQUET_2_0) | ||
| assertTrue( | ||
| "PARQUET_2_0 writer should produce DataPageV2 pages, got: " | ||
| + page.getClass().getSimpleName(), | ||
| page instanceof DataPageV2); | ||
|
|
||
| DataPageV2 pageV2 = (DataPageV2) page; | ||
| assertEquals( | ||
| "DataPageV2.num_nulls should be the actual null count even when statistics are disabled", | ||
| expectedNulls, | ||
| pageV2.getNullCount()); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we keep this counter in ColumnWriterBase instead? nullCount is page-level metadata, like valueCount and pageRowCount, while ColumnValueCollector is otherwise about optional statistics and bloom filters. Keeping it with the other page counters would make the ownership clearer.