Skip to content

Commit 323cbcf

Browse files
committed
update
1 parent 1972b5d commit 323cbcf

File tree

2 files changed

+44
-0
lines changed

2 files changed

+44
-0
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
<h3> Qompass AI on Python </h3>
88

99
![Repository Views](https://komarev.com/ghpvc/?username=qompassai-python)
10+
11+
[![Making Python useful for AI datasets](https://img.youtube.com/vi/T-XGHgaJIPU/hqdefault.jpg)](https://www.youtube.com/watch?v=T-XGHgaJIPU&t=511s)
12+
1013
<a href="https://www.python.org/">
1114
<img src="https://img.shields.io/badge/Python-3776AB?style=for-the-badge&logo=python&logoColor=white" alt="Python">
1215
</a>

par2csv.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# ~/.GH/Qompass/Python/par2csv.py
2+
# -------------------------------
3+
# Copyright (C) 2025 Qompass AI, All rights reserved
4+
5+
import os
6+
import polars as pl
7+
8+
def convert_nested_to_string(value):
    """Flatten a nested value into a single comma-separated string.

    Args:
        value: Any Python object. Lists and tuples are joined element by
            element, dicts are rendered as ``key:value`` pairs, and every
            other object is passed through ``str``.

    Returns:
        str: The flattened text. Empty containers produce ``""``.
    """
    if isinstance(value, (list, tuple)):
        return ",".join(str(v) for v in value)
    if isinstance(value, dict):
        # dict.items() yields the (key, value) pairs; dict has no .item().
        return ",".join(f"{k}:{v}" for k, v in value.items())
    return str(value)
15+
16+
def convert_parquet_to_csv(input_file, output_file, chunk_size=100000):
17+
lf = pl.scan_parquet(input_file)
18+
19+
header_written = False
20+
for i, series_chunk in enumerate(lf.collect(rechunk=chunk_size)):
21+
df_chunk = series_chunk.to_frame()
22+
23+
for col in df_chunk.columns:
24+
if df_chunk[col].dtype.is_nested():
25+
df_chunk = df_chunk.with_columns(
26+
pl.col(col).map_elements(lambda x:convert_nested_to_string(x), return_dtype=pl.Utf8)
27+
)
28+
29+
if not header_written:
30+
df_chunk.write_csv(output_file, include_header=True)
31+
header_written = True
32+
else:
33+
with open(output_file, 'a') as f:
34+
df_chunk.write_csv(f, include_header=False)
35+
36+
# Source Parquet file and destination CSV file for this conversion run.
input_parquet_file = '/d/downloads/medqar.parquet'
output_csv_file = '/d/downloads/medpqar.csv'

# Ensure the destination directory exists before the conversion starts.
os.makedirs(name=os.path.dirname(output_csv_file), exist_ok=True)

convert_parquet_to_csv(
    input_file=input_parquet_file,
    output_file=output_csv_file,
    chunk_size=100000,
)

0 commit comments

Comments
 (0)