|
| 1 | +# ~/.GH/Qompass/Python/par2csv.py |
| 2 | +# ------------------------------- |
| 3 | +# Copyright (C) 2025 Qompass AI, All rights reserved |
| 4 | + |
| 5 | +import os |
| 6 | +import polars as pl |
| 7 | + |
def convert_nested_to_string(value):
    """Flatten a nested cell value into a single comma-separated string.

    Args:
        value: A list/tuple, a dict, an object exposing ``to_list()``,
            or any scalar.

    Returns:
        ``"a,b,c"`` for sequences, ``"k1:v1,k2:v2"`` for dicts, and
        ``str(value)`` for everything else.
    """
    # polars appears to hand list-typed cells to ``map_elements`` as a Series
    # rather than a Python list; unwrap it so the items get joined instead of
    # the Series repr being written out.
    # NOTE(review): confirm against the installed polars version.
    if hasattr(value, "to_list"):
        value = value.to_list()

    if isinstance(value, (list, tuple)):
        return ",".join(str(item) for item in value)
    if isinstance(value, dict):
        # Must be ``items()``: ``dict`` has no ``item()`` method, so the
        # previous spelling raised AttributeError on every dict value.
        return ",".join(f"{key}:{val}" for key, val in value.items())
    return str(value)
| 15 | + |
def convert_parquet_to_csv(input_file, output_file, chunk_size=100000):
    """Convert a Parquet file to CSV, handling ``chunk_size`` rows at a time.

    Columns whose dtype reports ``is_nested()`` are flattened to strings with
    ``convert_nested_to_string`` before being written.

    Args:
        input_file: Path of the Parquet file to read.
        output_file: Path of the CSV file to create; an existing file is
            overwritten.
        chunk_size: Maximum number of rows collected and written per batch.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    lf = pl.scan_parquet(input_file)

    # One handle for the whole run: the header and every batch go to the same
    # stream, so the file never has to be reopened in append mode.
    with open(output_file, "wb") as out:
        offset = 0
        while True:
            # Iterating a collected DataFrame yields columns, not row batches,
            # so each batch is requested explicitly as a row window.
            df_chunk = lf.slice(offset, chunk_size).collect()

            # An empty first batch is still written so that an empty input
            # produces a header-only CSV; later empty batches mean "done".
            if offset > 0 and df_chunk.height == 0:
                break

            nested_exprs = [
                pl.col(name).map_elements(
                    convert_nested_to_string, return_dtype=pl.Utf8
                )
                for name, dtype in df_chunk.schema.items()
                if dtype.is_nested()
            ]
            if nested_exprs:
                df_chunk = df_chunk.with_columns(nested_exprs)

            # Only the first batch carries the header row.
            df_chunk.write_csv(out, include_header=(offset == 0))

            # A short batch means the end of the input was reached.
            if df_chunk.height < chunk_size:
                break
            offset += chunk_size
| 35 | + |
# Hard-coded source and destination for this one-off conversion run.
input_parquet_file = '/d/downloads/medqar.parquet'
output_csv_file = '/d/downloads/medpqar.csv'

# Create the destination directory up front so the CSV write cannot fail on
# a missing parent folder.
_output_dir = os.path.dirname(output_csv_file)
os.makedirs(_output_dir, exist_ok=True)

convert_parquet_to_csv(
    input_file=input_parquet_file,
    output_file=output_csv_file,
    chunk_size=100000,
)
0 commit comments