Skip to content

Commit cc1af6e

Browse files
committed
start testing the programmatic upload of files to S3
1 parent ad18d62 commit cc1af6e

14 files changed

Lines changed: 397 additions & 77 deletions

File tree

poetry.lock

Lines changed: 30 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ bigtree = "^0.12.2"
4242

4343
pyreaddbc = { version = ">=1.1.0", optional = true }
4444
pycparser = { version = "2.21", optional = true }
45+
dotenv = "^0.9.9"
4546

4647
[tool.poetry.extras]
4748
dbc = ["pyreaddbc", "pycparser"]

pysus/api/client.py

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .ducklake import DuckLakeClient
1212
from .ftp import FTPClient
1313
from .models import BaseLocalFile, BaseRemoteFile
14+
from .extensions import Parquet
1415

1516
Base = declarative_base()
1617

@@ -108,8 +109,7 @@ def _attach_client_catalog(self, name: str, path: str):
108109
existing = conn.exec_driver_sql(q, (abs_path,)).fetchone()
109110

110111
if not existing:
111-
conn.exec_driver_sql(f"ATTACH '{abs_path}' AS {
112-
name} (READ_ONLY)")
112+
conn.exec_driver_sql(f"ATTACH '{abs_path}' AS {name} (READ_ONLY)")
113113

114114
async def __aexit__(self, exc_type, exc_val, exc_tb):
115115
if self._ducklake:
@@ -122,7 +122,7 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
122122

123123
def _get_dest_path(self, file: BaseRemoteFile) -> Path:
124124
client_name = file.client.name.lower()
125-
dataset_name = getattr(file.parent, "name", "unknown_dataset")
125+
dataset_name = file.dataset.name.lower()
126126

127127
group_name = ""
128128
if hasattr(file, "group") and file.group:
@@ -148,8 +148,7 @@ async def _update_state(
148148
):
149149
with self.Session() as session:
150150
record = (
151-
session.query(LocalFileState).filter_by(
152-
path=str(local_path)).first()
151+
session.query(LocalFileState).filter_by(path=str(local_path)).first()
153152
)
154153
if not record:
155154
record = LocalFileState(
@@ -231,36 +230,38 @@ async def download_to_parquet(
231230
file: BaseRemoteFile,
232231
token: str = None,
233232
callback: Callable[[int, int], None] = None,
234-
):
233+
) -> Parquet:
235234
local_file = await self.download(
236235
file=file,
237236
token=token,
238237
callback=callback,
239238
)
240239

241-
if hasattr(local_file, "to_parquet"):
242-
original_path = local_file.path
240+
if not hasattr(local_file, "to_parquet"):
241+
raise NotImplementedError(
242+
f"{local_file} can't be converted to Parquet",
243+
)
243244

244-
parquet_file = await local_file.to_parquet(callback=callback)
245+
original_path = local_file.path
245246

246-
await self._update_state(
247-
local_path=parquet_file.path,
248-
remote_path=file.path,
249-
client_name=file.client.name.lower(),
250-
status=DownloadStatus.COMPLETED,
251-
year=file.year,
252-
month=file.month,
253-
state=file.state,
254-
group=getattr(file.group, "name", None),
255-
)
247+
parquet_file = await local_file.to_parquet(callback=callback)
256248

257-
if original_path.exists() and original_path != parquet_file.path:
258-
original_path.unlink()
259-
await self._delete_record(str(original_path))
249+
await self._update_state(
250+
local_path=parquet_file.path,
251+
remote_path=file.path,
252+
client_name=file.client.name.lower(),
253+
status=DownloadStatus.COMPLETED,
254+
year=file.year,
255+
month=file.month,
256+
state=file.state,
257+
group=getattr(file.group, "name", None),
258+
)
260259

261-
return parquet_file
260+
if original_path.exists() and original_path != parquet_file.path:
261+
original_path.unlink()
262+
await self._delete_record(str(original_path))
262263

263-
return local_file
264+
return parquet_file
264265

265266
def get_local_hierarchy(self):
266267
with self.Session() as session:

pysus/api/dadosgov/models.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,7 @@ def path(self) -> str:
4646
def extension(self) -> str:
4747
if self.record.file_name:
4848
return pathlib.Path(self.record.file_name).suffix
49-
return pathlib.Path(
50-
self.record.url.split("/")[-1].split("?")[0]
51-
).suffix
49+
return pathlib.Path(self.record.url.split("/")[-1].split("?")[0]).suffix
5250

5351
@property
5452
def size(self) -> int:
@@ -75,9 +73,7 @@ async def _download(
7573
output: pathlib.Path | None = None,
7674
callback: Callable[[int], None] | None = None,
7775
) -> pathlib.Path:
78-
return await self.client._download_file(
79-
self, output, callback=callback
80-
)
76+
return await self.client._download_file(self, output, callback=callback)
8177

8278
async def fetch_size(self) -> int:
8379
try:
@@ -104,9 +100,13 @@ async def fetch_size(self) -> int:
104100

105101
class Group(BaseRemoteGroup):
106102
record: ConjuntoDados
107-
_formatter: Callable[[Recurso, "Group"], dict[str, Any]] | None = (
108-
PrivateAttr(default=None)
109-
)
103+
_formatter: (
104+
Callable[
105+
[Recurso, "Group"],
106+
dict[str, Any],
107+
]
108+
| None
109+
) = PrivateAttr(default=None)
110110

111111
def __init__(
112112
self,
@@ -136,10 +136,13 @@ def description(self) -> str:
136136
async def _fetch_files(self) -> list[File]:
137137
files = []
138138
for recurso in self.record.resources:
139-
metadata = (
140-
self._formatter(recurso, self) if self._formatter else {}
139+
metadata = self._formatter(recurso, self) if self._formatter else {}
140+
file = File(
141+
record=recurso,
142+
dataset=self.dataset,
143+
group=self,
144+
_metadata=metadata,
141145
)
142-
file = File(record=recurso, parent=self, _metadata=metadata)
143146
files.append(file)
144147
return files
145148

@@ -162,8 +165,6 @@ async def _fetch_content(self) -> list[Group]:
162165
for group_id in self.ids:
163166
record = await client.get_dataset(group_id)
164167
items.append(
165-
Group(
166-
record=record, dataset=self, formatter=self.formatter
167-
)
168+
Group(record=record, dataset=self, formatter=self.formatter)
168169
)
169170
return items

pysus/api/ducklake/catalog.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
Integer,
1111
String,
1212
Table,
13+
Sequence,
1314
)
1415
from sqlalchemy.orm import declarative_base, relationship
1516

@@ -42,7 +43,11 @@ class Origin(enum.Enum):
4243
class CatalogDataset(CatalogTable):
4344
__tablename__ = "datasets"
4445

45-
id = Column(Integer, primary_key=True)
46+
id = Column(
47+
Integer,
48+
Sequence("datasets_id_seq", schema="pysus"),
49+
primary_key=True,
50+
)
4651
name = Column(String, nullable=False, unique=True, index=True)
4752
long_name = Column(String, nullable=False)
4853
description = Column(String, nullable=True)
@@ -68,7 +73,11 @@ class CatalogDataset(CatalogTable):
6873
class ColumnDefinition(CatalogTable):
6974
__tablename__ = "dataset_columns"
7075

71-
id = Column(Integer, primary_key=True)
76+
id = Column(
77+
Integer,
78+
Sequence("columns_id_seq", schema="pysus"),
79+
primary_key=True,
80+
)
7281
dataset_id = Column(
7382
Integer,
7483
ForeignKey("pysus.datasets.id"),
@@ -96,7 +105,11 @@ class ColumnDefinition(CatalogTable):
96105
class DatasetGroup(CatalogTable):
97106
__tablename__ = "dataset_groups"
98107

99-
id = Column(Integer, primary_key=True)
108+
id = Column(
109+
Integer,
110+
Sequence("groups_id_seq", schema="pysus"),
111+
primary_key=True,
112+
)
100113
name = Column(String, nullable=False)
101114
dataset_id = Column(
102115
Integer,
@@ -123,7 +136,11 @@ class DatasetGroup(CatalogTable):
123136
class CatalogFile(CatalogTable):
124137
__tablename__ = "files"
125138

126-
id = Column(Integer, primary_key=True)
139+
id = Column(
140+
Integer,
141+
Sequence("files_id_seq", schema="pysus"),
142+
primary_key=True,
143+
)
127144
dataset_id = Column(
128145
Integer, ForeignKey("pysus.datasets.id"), nullable=False, index=True
129146
)
@@ -137,6 +154,7 @@ class CatalogFile(CatalogTable):
137154
size = Column(Integer, nullable=False)
138155
rows = Column(Integer, nullable=False)
139156
modified = Column(DateTime, nullable=False)
157+
origin_modified = Column(DateTime, nullable=True)
140158
sha256 = Column(String(64), nullable=True, index=True)
141159
year = Column(Integer, nullable=True, index=True)
142160
month = Column(Integer, nullable=True, index=True)
@@ -151,5 +169,6 @@ class CatalogFile(CatalogTable):
151169
__table_args__ = (
152170
Index("ix_files_dataset_group", "dataset_id", "group_id"),
153171
Index("ix_files_temporal", "year", "month"),
172+
Index("ix_files_lookup", "dataset_id", "group_id", "year", "month", "state"),
154173
{"schema": "pysus"},
155174
)

pysus/api/ducklake/client.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,12 +155,14 @@ async def connect(self, force: bool = False):
155155

156156
async def close(self):
157157
if self._engine:
158-
if self._is_authenticated:
159-
await self._upload_catalog()
160-
161158
await anyio.to_thread.run_sync(self._engine.dispose)
159+
162160
self._engine = None
163161
self._Session = None
162+
163+
if self._is_authenticated:
164+
await self._upload_catalog()
165+
164166
self._s3_client = None
165167

166168
async def _download_file(

pysus/api/extensions.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,29 @@
1818
import pyarrow as pa
1919
import pyarrow.parquet as pq
2020
from pydantic import Field, PrivateAttr
21+
from dbfread import DBF as DBFReader
22+
2123
from pysus import CACHEPATH
2224
from pysus.api.models import BaseCompressedFile, BaseLocalFile, BaseTabularFile
2325

2426
from .types import FileType
2527

28+
import sys
29+
import ctypes.util
30+
2631
try:
27-
from dbfread import DBF as DBFReader
28-
from pyreaddbc import dbc2dbf
32+
LIBFFI = True
33+
if sys.platform.startswith("linux"):
34+
LIBFFI = ctypes.util.find_library("ffi") is not None
35+
36+
if LIBFFI:
37+
from pyreaddbc import dbc2dbf
2938

30-
FTP_IMPORT = True
39+
DBC_IMPORT = True
40+
else:
41+
DBC_IMPORT = False
3142
except ImportError:
32-
FTP_IMPORT = False
43+
DBC_IMPORT = False
3344

3445

3546
class File(BaseLocalFile):
@@ -160,6 +171,10 @@ def _get_reader_sync():
160171
class Parquet(BaseTabularFile):
161172
type: FileType = Field("Parquet")
162173

174+
@property
175+
def schema(self) -> pa.Schema:
176+
return pq.read_schema(self.path)
177+
163178
@property
164179
def columns(self) -> list[str]:
165180
return pq.read_schema(self.path).names
@@ -607,7 +622,9 @@ def _extract():
607622
class FTPNotImported(BaseTabularFile):
608623
type: FileType = Field(None)
609624
import_err: ClassVar[str] = """
610-
run "pip install pysus[dbc]" to handle DBC files
625+
run "pip install pysus[dbc]" to handle DBC files.
626+
Make sure you also have libffi installed on the system. It may not work
627+
on Windows
611628
"""
612629

613630
@property
@@ -647,7 +664,7 @@ class ExtensionFactory:
647664
".csv": CSV,
648665
".parquet": Parquet,
649666
".dbf": DBF,
650-
".dbc": DBC if FTP_IMPORT else FTPNotImported,
667+
".dbc": DBC if DBC_IMPORT else FTPNotImported,
651668
".pdf": PDF,
652669
".json": JSON,
653670
}

0 commit comments

Comments
 (0)