diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 04da882..2bd24ef 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,15 +1,20 @@ name: Build and Test Package on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true jobs: build-sdist: name: Build Source Dist runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: recursive - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' - name: Install Tools @@ -18,11 +23,11 @@ jobs: - name: Source Packaging run: | python -m build --sdist - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 with: name: sdist path: 'dist/spotfire-*.tar.gz' - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 with: name: test-files path: | @@ -50,12 +55,12 @@ jobs: operating-system: ['ubuntu-latest', 'windows-latest'] fail-fast: false steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: sdist path: dist - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install Build Requirements @@ -81,7 +86,7 @@ jobs: python -m build --wheel # Move wheel out of build dir into top-level dist dir mv dist\*.whl ..\dist - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 with: name: wheel-${{ matrix.python-version }}-${{ matrix.operating-system }} path: 'dist/spotfire-*.whl' @@ -96,16 +101,16 @@ jobs: test-environment: ${{ fromJson(needs.build-sdist.outputs.test-environments) }} fail-fast: false steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: wheel-${{ matrix.python-version }}-${{ matrix.operating-system }} path: dist - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: test-files path: test-files - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install Dependencies (Linux) @@ -122,7 +127,7 @@ jobs: env: TEST_FILES_DIR: ${{ github.workspace }}/test-files/spotfire/test/files TEST_ENVIRONMENT: ${{ matrix.test-environment }} - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: ${{ always() }} with: name: test-results-${{ matrix.python-version }}-${{ matrix.operating-system }}-${{ matrix.test-environment }} @@ -138,14 +143,14 @@ jobs: echo -n "python-version=" >> $GITHUB_OUTPUT echo '${{ needs.build-sdist.outputs.python-versions }}' | sed -e 's/[^"]*"//' -e 's/".*//' >> $GITHUB_OUTPUT - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ steps.version.outputs.python-version }} - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: sdist path: dist - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: wheel-${{ steps.version.outputs.python-version }}-ubuntu-latest path: dist @@ -163,3 +168,35 @@ jobs: mypy spotfire cython-lint spotfire vendor find spotfire -name '*_helpers.[ch]' | xargs cpplint --repository=. + asan: + name: AddressSanitizer + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + with: + submodules: recursive + - uses: actions/setup-python@v6 + with: + python-version: '3.13' + - name: Install dependencies + run: | + pip install setuptools Cython "numpy>=2.0.0rc1" + pip install ".[polars]" + pip install html-testRunner polars pillow + - name: Rebuild extension with AddressSanitizer + env: + CFLAGS: "-fsanitize=address -fno-omit-frame-pointer -g" + LDFLAGS: "-fsanitize=address" + run: python setup.py build_ext --inplace + - name: Run tests under AddressSanitizer + run: | + LIBASAN=$(gcc -print-file-name=libasan.so) + LD_PRELOAD="$LIBASAN" PYTHONMALLOC=malloc python -m spotfire.test + env: + ASAN_OPTIONS: "detect_leaks=0:allocator_may_return_null=1:intercept_cxx_exceptions=0" + TEST_ENVIRONMENT: asan + - uses: actions/upload-artifact@v7 + if: always() + with: + name: test-results-asan + path: build/test-results/*.html diff --git a/.github/workflows/pylint.yaml b/.github/workflows/pylint.yaml index 58911b8..3f03d70 100644 --- a/.github/workflows/pylint.yaml +++ b/.github/workflows/pylint.yaml @@ -7,11 +7,11 @@ jobs: name: Check Linters runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: recursive - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' - name: Install Tools diff --git a/.github/workflows/sbom.yaml b/.github/workflows/sbom.yaml index 72094c3..45cd37e 100644 --- a/.github/workflows/sbom.yaml +++ b/.github/workflows/sbom.yaml @@ -35,7 +35,7 @@ jobs: outputs: python-versions: ${{ steps.dynamic.outputs.pythons }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Read python-versions id: dynamic run: | @@ -48,14 +48,14 @@ jobs: needs: setup runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: recursive # needed for vendor/sbdf-c when building/installing sdist # workflow_run: reuse artifact from build.yaml — no rebuild - name: Download sdist (from workflow_run) if: github.event_name == 'workflow_run' - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: sdist path: dist @@ -64,7 +64,7 @@ jobs: # push / release / workflow_dispatch: build fresh - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' - name: Build sdist @@ -118,7 +118,7 @@ jobs: --tool "trivy-${{ env.TRIVY_VERSION }}" - name: Upload SBOM artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: sbom-sdist path: spotfire-sdist.sbom.spdx.json @@ -133,14 +133,14 @@ jobs: python-version: ${{ fromJson(needs.setup.outputs.python-versions) }} fail-fast: false steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: recursive # needed for vendor/sbdf-c when building wheel fresh # workflow_run: reuse the ubuntu wheel artifact from build.yaml — no rebuild - name: Download wheel (from workflow_run) if: github.event_name == 'workflow_run' - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: wheel-${{ matrix.python-version }}-ubuntu-latest path: dist @@ -150,7 +150,7 @@ jobs: # Also download the sdist so scan-env can install from it (wheel is platform-specific) - name: Download sdist (from workflow_run) if: github.event_name == 'workflow_run' - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: sdist path: dist @@ -160,7 +160,7 @@ jobs: # push / release / workflow_dispatch: build fresh on Linux - name: Set Up Python if: github.event_name != 'workflow_run' - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Build wheel @@ -221,7 +221,7 @@ jobs: --tool "trivy-${{ env.TRIVY_VERSION }}" - name: Upload SBOM artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: sbom-wheel-${{ matrix.python-version }} path: spotfire-wheel-${{ matrix.python-version }}.sbom.spdx.json @@ -234,7 +234,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download all SBOM artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: pattern: sbom-* path: all-sboms diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index d7ae63e..c2a1101 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1,4 +1,4 @@ -# cython: language_level=3 +# cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True # Copyright © 2022. Cloud Software Group, Inc. # This file is subject to the license terms contained @@ -954,14 +954,11 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): importer_contexts.append(_ImportContext(np_c.NPY_INT32, col_type)) importer_fns[i] = _import_vts_numpy elif col_type.id == sbdf_c.SBDF_DATETIMETYPEID: - if output_format == OutputFormat.POLARS: - # Store raw int64 ms values; _import_build_polars_dataframe will adjust the - # epoch offset and reinterpret as datetime64[ms] without boxing Python objects. - importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) - importer_fns[i] = _import_vts_numpy - else: - importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) - importer_fns[i] = _import_vt_datetime + # Store raw int64 ms values for both Polars and Pandas paths. The Pandas + # assembly converts vectorially with pd.to_datetime(); _import_build_polars_dataframe + # adjusts the epoch offset and casts zero-copy. + importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) + importer_fns[i] = _import_vts_numpy elif col_type.id == sbdf_c.SBDF_DATETYPEID: if output_format == OutputFormat.POLARS: importer_contexts.append(_ImportContext(np_c.NPY_INT32, col_type)) @@ -970,14 +967,10 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_date elif col_type.id == sbdf_c.SBDF_TIMESPANTYPEID: - if output_format == OutputFormat.POLARS: - # Timespans are stored as int64 ms with no epoch — reinterpret directly as - # timedelta64[ms] in _import_build_polars_dataframe. - importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) - importer_fns[i] = _import_vts_numpy - else: - importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) - importer_fns[i] = _import_vt_timespan + # Store raw int64 ms for both paths. Pandas assembly uses pd.to_timedelta(); + # _import_build_polars_dataframe reinterprets as Duration('ms') zero-copy. + importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) + importer_fns[i] = _import_vts_numpy elif col_type.id == sbdf_c.SBDF_TIMETYPEID: if output_format == OutputFormat.POLARS: importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) @@ -1039,8 +1032,30 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): for i in range(num_columns): values = importer_contexts[i].get_values_array() invalid_array = importer_contexts[i].get_invalid_array() + vt_id = importer_contexts[i].get_value_type_id() dtype_name = importer_contexts[i].get_pandas_dtype_name() - if dtype_name in ("Int32", "Int64"): + if vt_id == sbdf_c.SBDF_DATETIMETYPEID: + # values is int64 ms since SBDF epoch. Subtract the fixed SBDF→Unix offset, + # then reinterpret the buffer as datetime64[ms] via view() — zero-copy, no + # nanosecond conversion, and wide enough to represent the full SBDF date range + # (year 1 through 9999). Write the NaT sentinel (INT64_MIN) directly into the + # int64 buffer so NaT positions are set in a single pass without a slow second + # .loc assignment through the Pandas indexing layer. + arr_ms = values - _SBDF_TO_UNIX_EPOCH_MS + if invalid_array.any(): + arr_ms[invalid_array] = np.iinfo(np.int64).min # NaT sentinel for datetime64 + column_series = pd.Series(arr_ms.view('datetime64[ms]'), dtype='datetime64[ms]', + name=column_names[i]) + elif vt_id == sbdf_c.SBDF_TIMESPANTYPEID: + # values is int64 ms — reinterpret directly as timedelta64[ms]; same trick as + # datetime: view() avoids any per-element conversion. NaT sentinel written + # directly to eliminate the second .loc pass. No .copy() needed: values is + # already a fresh array from np.concatenate() in get_values_array(). + if invalid_array.any(): + values[invalid_array] = np.iinfo(np.int64).min # NaT sentinel for timedelta64 + column_series = pd.Series(values.view('timedelta64[ms]'), dtype='timedelta64[ms]', + name=column_names[i]) + elif dtype_name in ("Int32", "Int64"): # Build nullable integer array with mask in one shot; avoids a second-pass # .loc assignment that triggers Pandas dtype coercion overhead. base_dtype = "int32" if dtype_name == "Int32" else "int64" @@ -1048,10 +1063,18 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): pd.arrays.IntegerArray(values.astype(base_dtype), invalid_array), name=column_names[i]) else: - column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) - column_series.loc[invalid_array] = None + if values.dtype.kind == 'O': + # Object-dtype (string) arrays can be pre-masked before Series construction, + # avoiding the pandas .loc indexing overhead on the post-construction path. + if invalid_array.any(): + values[invalid_array] = None + column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) + else: + column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) + if invalid_array.any(): + column_series.loc[invalid_array] = None imported_columns.append(column_series) - dataframe = pd.concat(imported_columns, axis=1) + dataframe = pd.DataFrame(dict(zip(column_names, imported_columns))) for i in range(num_columns): dataframe[column_names[i]].spotfire_column_metadata = column_metadata[i] dataframe[column_names[i]].attrs['spotfire_type'] = importer_contexts[i].get_spotfire_type_name() @@ -1114,6 +1137,7 @@ cdef class _ExportContext: cdef int polars_exporter_id # 0=default; 1=datetime; 2=date; 3=timespan; 4=time; 5=string cdef np_c.ndarray _arrow_offsets # int64 view of Arrow offsets buffer (string fast path) cdef np_c.ndarray _arrow_data # uint8 view of Arrow values buffer (string fast path) + cdef bint values_precomputed_sbdf_int64 # True when values_array already holds int64 SBDF-ms def __init__(self): """Initialize the export context.""" @@ -1124,6 +1148,7 @@ cdef class _ExportContext: self.polars_exporter_id = 0 self._arrow_offsets = None self._arrow_data = None + self.values_precomputed_sbdf_int64 = False cdef void set_arrays(self, np_c.ndarray values, invalid): """Set the NumPy ``ndarray`` with the values to export and a list or NumPy ``ndarray`` of whether each value @@ -1134,7 +1159,7 @@ cdef class _ExportContext: """ self.values_array = values self.invalid_array = np.asarray(invalid, dtype="bool") - self.any_invalid = any(invalid) + self.any_invalid = bool(self.invalid_array.any()) cdef void set_arrow_string(self, np_c.ndarray offsets, np_c.ndarray data, np_c.ndarray invalid): @@ -1250,12 +1275,49 @@ cdef _export_obj_dataframe(obj): pd.NA: na_value, pd.NaT: na_value, } - if obj[col].dtype == "object": + col_dtype = obj[col].dtype + invalids = pd.isnull(obj[col]) + if (context.valuetype_id == sbdf_c.SBDF_DATETIMETYPEID and col_dtype.kind == 'M' and + not hasattr(col_dtype, 'tz')): + # Pre-compute int64 SBDF-ms once so the exporter is zero-copy (no per-chunk + # alloc+copy+add). view('int64') + offset produces a new contiguous int64 array; + # NaT positions (INT64_MIN + offset, still valid int64) are zeroed here so the + # exporter can call _export_get_offset_ptr directly without further work. + raw = obj[col].to_numpy(dtype="datetime64[ms]", na_value=np.datetime64("NaT")) + values = raw.view(np.int64) + _SBDF_TO_UNIX_EPOCH_MS + if invalids.any(): + values[invalids] = 0 + context.set_arrays(values, invalids) + context.values_precomputed_sbdf_int64 = True + elif context.valuetype_id == sbdf_c.SBDF_TIMESPANTYPEID and col_dtype.kind == 'm': + # Same zero-copy pre-computation for timedelta64[ms]: int64 view IS already ms, + # no epoch offset required — just copy so we can safely zero invalid positions. + raw = obj[col].to_numpy(dtype="timedelta64[ms]", na_value=np.timedelta64("NaT")) + values = raw.view(np.int64).copy() + if invalids.any(): + values[invalids] = 0 + context.set_arrays(values, invalids) + context.values_precomputed_sbdf_int64 = True + elif context.valuetype_id == sbdf_c.SBDF_DATETYPEID and col_dtype == object: + # Pre-compute int64 SBDF-ms for date (object) columns: numpy's asarray covers + # the full year-1 to year-9999 range (pd.to_datetime silently coerces out-of- + # Timestamp-range dates to NaT). Zero null positions before multiplication to + # prevent int64 overflow from NaT's INT64_MIN sentinel. + days_dt64 = np.asarray(obj[col], dtype='datetime64[D]') + days = days_dt64.view(np.int64).copy() + if invalids.any(): + days[invalids] = 0 + values = days * 86400000 + _SBDF_TO_UNIX_EPOCH_MS + if invalids.any(): + values[invalids] = 0 + context.set_arrays(values, invalids) + context.values_precomputed_sbdf_int64 = True + elif col_dtype == "object": values = obj[col].replace(nas).to_numpy() + context.set_arrays(values, invalids) else: values = obj[col].replace(nas).to_numpy(dtype=context.get_numpy_dtype()) - invalids = pd.isnull(obj[col]) - context.set_arrays(values, invalids) + context.set_arrays(values, invalids) exporter_contexts.append(context) try: column_metadata.append(obj[col].spotfire_column_metadata) @@ -1948,24 +2010,38 @@ cdef int _export_vt_datetime(_ExportContext context, Py_ssize_t start, Py_ssize_ """Export a slice of data consisting of datetime values.""" cdef np_c.npy_intp shape[1] shape[0] = count - cdef np_c.ndarray new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef np_c.ndarray new_values cdef int i - current_tz = datetime.datetime.now().astimezone().tzinfo - for i in range(count): - if not context.invalid_array[start + i]: - val_i = context.values_array[start + i] - if isinstance(val_i, pd.Timestamp): - if val_i.tz: - dt = val_i.tz_convert(current_tz).tz_localize(None).to_pydatetime() + if context.values_precomputed_sbdf_int64: + # Zero-copy path: values_array already holds int64 SBDF-ms with invalids zeroed. + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_datetime(), count, + _export_get_offset_ptr(context.values_array, start, count), + NULL, obj) + if context.values_array.dtype.kind == 'M': + # Fast path for tz-naive datetime64[ms]: single numpy op produces a new int64 array + # with the SBDF epoch offset applied (no separate alloc+copy+add steps). + new_values = context.values_array[start:start + count].view(np.int64) + _SBDF_TO_UNIX_EPOCH_MS + invalid_slice = context.invalid_array[start:start + count] + if invalid_slice.any(): + new_values[invalid_slice] = 0 + else: + new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + current_tz = datetime.datetime.now().astimezone().tzinfo + for i in range(count): + if not context.invalid_array[start + i]: + val_i = context.values_array[start + i] + if isinstance(val_i, pd.Timestamp): + if val_i.tz: + dt = val_i.tz_convert(current_tz).tz_localize(None).to_pydatetime() + else: + dt = val_i.to_pydatetime() + elif isinstance(val_i, np.datetime64): + dt = np.datetime64(val_i, "ms").astype(datetime.datetime) + elif isinstance(val_i, datetime.datetime): + dt = val_i else: - dt = val_i.to_pydatetime() - elif isinstance(val_i, np.datetime64): - dt = np.datetime64(val_i, "ms").astype(datetime.datetime) - elif isinstance(val_i, datetime.datetime): - dt = val_i - else: - raise SBDFError(f"cannot convert '{val_i}' to Spotfire DateTime type; incompatible types") - new_values[i] = int((dt - _DATETIME_EPOCH) / _TIMEDELTA_ONE_MSEC) + raise SBDFError(f"cannot convert '{val_i}' to Spotfire DateTime type; incompatible types") + new_values[i] = int((dt - _DATETIME_EPOCH) / _TIMEDELTA_ONE_MSEC) return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_datetime(), count, np_c.PyArray_DATA(new_values), NULL, obj) @@ -1973,8 +2049,14 @@ cdef int _export_vt_date(_ExportContext context, Py_ssize_t start, Py_ssize_t co """Export a slice of data consisting of date values.""" cdef np_c.npy_intp shape[1] shape[0] = count - cdef np_c.ndarray new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef np_c.ndarray new_values cdef int i + if context.values_precomputed_sbdf_int64: + # Zero-copy path: values_array already holds int64 SBDF-ms (midnight) with invalids zeroed. + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_date(), count, + _export_get_offset_ptr(context.values_array, start, count), + NULL, obj) + new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) for i in range(count): if not context.invalid_array[start + i]: val_i = context.values_array[start + i] @@ -1998,10 +2080,12 @@ cdef int _export_vt_time(_ExportContext context, Py_ssize_t start, Py_ssize_t co if not context.invalid_array[start + i]: val_i = context.values_array[start + i] if isinstance(val_i, datetime.time): - val = datetime.datetime.combine(datetime.datetime.min, val_i) - datetime.datetime.min + # Direct integer arithmetic on time attributes avoids allocating a datetime + # and timedelta object per row (which datetime.combine(...) - min requires). + new_values[i] = ((val_i.hour * 3600 + val_i.minute * 60 + val_i.second) * 1000 + + val_i.microsecond // 1000) else: raise SBDFError(f"cannot convert '{val_i}' to Spotfire Time type; incompatible types") - new_values[i] = val // _TIMEDELTA_ONE_MSEC return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_time(), count, np_c.PyArray_DATA(new_values), NULL, obj) @@ -2009,32 +2093,51 @@ cdef int _export_vt_timespan(_ExportContext context, Py_ssize_t start, Py_ssize_ """Export a slice of data consisting of timespan values.""" cdef np_c.npy_intp shape[1] shape[0] = count - cdef np_c.ndarray new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef np_c.ndarray new_values cdef int i - for i in range(count): - if not context.invalid_array[start + i]: - val_i = context.values_array[start + i] - if isinstance(val_i, pd.Timedelta): - td = val_i.to_pytimedelta() - elif isinstance(val_i, np.timedelta64): - td = np.timedelta64(val_i, "ms").astype(datetime.timedelta) - elif isinstance(val_i, datetime.timedelta): - td = val_i - else: - raise SBDFError(f"cannot convert '{val_i}' to Spotfire TimeSpan type; incompatible types") - new_values[i] = int(td / _TIMEDELTA_ONE_MSEC) + if context.values_precomputed_sbdf_int64: + # Zero-copy path: values_array already holds int64 ms with invalids zeroed. + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_timespan(), count, + _export_get_offset_ptr(context.values_array, start, count), + NULL, obj) + if context.values_array.dtype.kind == 'm': + # Fast path for timedelta64[ms]: single-op slice+view (no alloc+copy+zero triple). + new_values = context.values_array[start:start + count].view(np.int64).copy() + invalid_slice = context.invalid_array[start:start + count] + if invalid_slice.any(): + new_values[invalid_slice] = 0 + else: + new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + for i in range(count): + if not context.invalid_array[start + i]: + val_i = context.values_array[start + i] + if isinstance(val_i, pd.Timedelta): + td = val_i.to_pytimedelta() + elif isinstance(val_i, np.timedelta64): + td = np.timedelta64(val_i, "ms").astype(datetime.timedelta) + elif isinstance(val_i, datetime.timedelta): + td = val_i + else: + raise SBDFError(f"cannot convert '{val_i}' to Spotfire TimeSpan type; incompatible types") + new_values[i] = int(td / _TIMEDELTA_ONE_MSEC) return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_timespan(), count, np_c.PyArray_DATA(new_values), NULL, obj) cdef int _export_vt_string(_ExportContext context, Py_ssize_t start, Py_ssize_t count, sbdf_c.sbdf_object** obj): """Export a slice of data consisting of string values.""" - obj[0] = _export_extract_string_obj(context.values_array, context.invalid_array, start, count) + obj[0] = _export_extract_string_obj( + np_c.PyArray_DATA(context.values_array), + np_c.PyArray_DATA(context.invalid_array), + start, count) return sbdf_c.SBDF_OK cdef int _export_vt_binary(_ExportContext context, Py_ssize_t start, Py_ssize_t count, sbdf_c.sbdf_object** obj): """Export a slice of data consisting of binary values.""" - obj[0] = _export_extract_binary_obj(context.values_array, context.invalid_array, start, count) + obj[0] = _export_extract_binary_obj( + np_c.PyArray_DATA(context.values_array), + np_c.PyArray_DATA(context.invalid_array), + start, count) return sbdf_c.SBDF_OK @@ -2273,15 +2376,16 @@ cdef (int, sbdf_c.sbdf_valuearray*) _export_process_invalid_array(_ExportContext cdef inline void* _export_get_offset_ptr(np_c.ndarray array, Py_ssize_t start, Py_ssize_t count): - """Slice a NumPy ``ndarray`` using Cython memoryviews. + """Return a pointer into ``array`` at element ``start``. :param array: the NumPy array to slice :param start: the index of the first element of the slice - :param count: the number of elements to include in the slice - :return: a pointer to the memory (owned by the NumPy array) of the slice + :param count: unused; kept for call-site compatibility + :return: a pointer to element ``start`` in the array's data buffer """ - cdef np_c.ndarray sliced = array[start : start + count] - return np_c.PyArray_DATA(sliced) + cdef char *base = np_c.PyArray_DATA(array) + cdef Py_ssize_t sz = array.itemsize + return (base + start * sz) cdef sbdf_c.sbdf_metadata_head* _export_metadata(dict md, int column_num): @@ -2324,7 +2428,12 @@ cdef sbdf_c.sbdf_metadata_head* _export_metadata(dict md, int column_num): val_type.id = _export_infer_valuetype_from_type(val, f"{metadata_description} metadata '{name_str}'") if val_type.id == sbdf_c.SBDF_STRINGTYPEID: - obj = _export_extract_string_obj(val, [False] * val_len, 0, val_len) + _meta_vals = np.asarray(val, dtype=object) + _meta_inv = np.zeros(val_len, dtype=bool) + obj = _export_extract_string_obj( + np_c.PyArray_DATA(_meta_vals), + np_c.PyArray_DATA(_meta_inv), + 0, val_len) error = sbdf_c.SBDF_OK elif val_type.id == sbdf_c.SBDF_DOUBLETYPEID: data_double = mem.PyMem_RawMalloc(val_len * sizeof(double)) @@ -2384,7 +2493,12 @@ cdef sbdf_c.sbdf_metadata_head* _export_metadata(dict md, int column_num): error = sbdf_c.sbdf_obj_create_arr(val_type, val_len, data_datetime, NULL, &obj) mem.PyMem_RawFree(data_datetime) elif val_type.id == sbdf_c.SBDF_BINARYTYPEID: - obj = _export_extract_binary_obj(val, [False] * val_len, 0, val_len) + _meta_vals = np.asarray(val, dtype=object) + _meta_inv = np.zeros(val_len, dtype=bool) + obj = _export_extract_binary_obj( + np_c.PyArray_DATA(_meta_vals), + np_c.PyArray_DATA(_meta_inv), + 0, val_len) error = sbdf_c.SBDF_OK elif val_type.id == sbdf_c.SBDF_DECIMALTYPEID: data_decimal = <_SbdfDecimal*>mem.PyMem_RawMalloc(val_len * sizeof(_SbdfDecimal)) diff --git a/spotfire/sbdf_helpers.c b/spotfire/sbdf_helpers.c index ce89a23..964593c 100644 --- a/spotfire/sbdf_helpers.c +++ b/spotfire/sbdf_helpers.c @@ -80,15 +80,23 @@ void _allocated_list_done(struct _AllocatedList *alist, _allocated_dealloc_fn fu } } -/* Utility functions for extracting strings from Python ``Union[str,bytes]`` into C */ -sbdf_object *_export_extract_string_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count) { +/* Utility functions for extracting strings from Python ``Union[str,bytes]`` into C. + * vals_ptr is PyArray_DATA() of a numpy object array; each slot is a borrowed PyObject*. + * inv_ptr is PyArray_DATA() of a numpy bool array; nonzero byte means null/invalid. + */ +sbdf_object *_export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count) { sbdf_object *t = calloc(1, sizeof(sbdf_object)); + if (!t) { + PyErr_NoMemory(); + return NULL; + } t->type = sbdf_vt_string(); t->count = (int)count; char **data = (char **)calloc(count, sizeof(char *)); if (!data) { - PyErr_Format(PyExc_MemoryError, "memory exhausted"); + PyErr_NoMemory(); sbdf_obj_destroy(t); return NULL; } @@ -96,53 +104,33 @@ sbdf_object *_export_extract_string_obj(PyObject *vals, PyObject *invalids, Py_s for (int i = 0; i < count; i++) { Py_ssize_t idx = start + i; - PyObject *inv = PySequence_GetItem(invalids, idx); - if (inv == NULL) { - sbdf_obj_destroy(t); - return NULL; - } - if (PyObject_IsTrue(inv)) { - /* true: invalid value, add empty value to t->data */ + if (inv_ptr[idx]) { + /* null/invalid value: write empty string */ data[i] = sbdf_str_create_len("", 0); } else { - /* false: valid value, add encoded value to t->data */ - PyObject *val = PySequence_GetItem(vals, idx); - if (val == NULL) { - Py_XDECREF(inv); - sbdf_obj_destroy(t); - return NULL; - } + /* valid value: borrowed ref from numpy object array — no Py_DECREF */ + PyObject *val = (PyObject *)vals_ptr[idx]; PyObject *val_str = PyObject_Str(val); if (val_str == NULL) { - Py_XDECREF(val); - Py_XDECREF(inv); sbdf_obj_destroy(t); return NULL; } PyObject *val_encoded = PyObject_CallMethod(val_str, "encode", "s", "utf-8"); + Py_DECREF(val_str); if (val_encoded == NULL) { - Py_XDECREF(val_str); - Py_XDECREF(val); - Py_XDECREF(inv); sbdf_obj_destroy(t); return NULL; } char *val_buf; Py_ssize_t val_len; if (PyBytes_AsStringAndSize(val_encoded, &val_buf, &val_len) == -1) { - Py_XDECREF(val_encoded); - Py_XDECREF(val_str); - Py_XDECREF(val); - Py_XDECREF(inv); + Py_DECREF(val_encoded); sbdf_obj_destroy(t); return NULL; } data[i] = sbdf_str_create_len(val_buf, (int)val_len); - Py_XDECREF(val_encoded); - Py_XDECREF(val_str); - Py_XDECREF(val); + Py_DECREF(val_encoded); } - Py_XDECREF(inv); } return t; @@ -178,14 +166,19 @@ sbdf_object *_export_extract_string_obj_arrow(const char *values_buf, const int6 return t; } -sbdf_object *_export_extract_binary_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count) { +sbdf_object *_export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count) { sbdf_object *t = calloc(1, sizeof(sbdf_object)); + if (!t) { + PyErr_NoMemory(); + return NULL; + } t->type = sbdf_vt_binary(); t->count = (int)count; unsigned char **data = (unsigned char **)calloc(count, sizeof(unsigned char *)); if (!data) { - PyErr_Format(PyExc_MemoryError, "memory exhausted"); + PyErr_NoMemory(); sbdf_obj_destroy(t); return NULL; } @@ -193,41 +186,25 @@ sbdf_object *_export_extract_binary_obj(PyObject *vals, PyObject *invalids, Py_s for (int i = 0; i < count; i++) { Py_ssize_t idx = start + i; - PyObject *inv = PySequence_GetItem(invalids, idx); - if (inv == NULL) { - sbdf_obj_destroy(t); - return NULL; - } - if (PyObject_IsTrue(inv)) { - /* true: invalid value, add empty value to t->data */ + if (inv_ptr[idx]) { + /* null/invalid value: write empty byte array */ data[i] = sbdf_ba_create(0, 0); } else { - /* false: valid value, add value to t->data */ - PyObject *val = PySequence_GetItem(vals, idx); - if (val == NULL) { - Py_XDECREF(inv); - sbdf_obj_destroy(t); - return NULL; - } + /* valid value: borrowed ref from numpy object array — no Py_DECREF */ + PyObject *val = (PyObject *)vals_ptr[idx]; if (!PyBytes_Check(val)) { PyErr_Format(PyExc_SBDFError, "cannot convert '%S' to Spotfire Binary type; incompatible types", val); - Py_XDECREF(val); - Py_XDECREF(inv); sbdf_obj_destroy(t); return NULL; } char *val_buf; Py_ssize_t val_len; if (PyBytes_AsStringAndSize(val, &val_buf, &val_len) == -1) { - Py_XDECREF(val); - Py_XDECREF(inv); sbdf_obj_destroy(t); return NULL; } data[i] = sbdf_ba_create((unsigned char *)val_buf, (int)val_len); - Py_XDECREF(val); } - Py_XDECREF(inv); } return t; diff --git a/spotfire/sbdf_helpers.h b/spotfire/sbdf_helpers.h index 04e1255..a263b01 100644 --- a/spotfire/sbdf_helpers.h +++ b/spotfire/sbdf_helpers.h @@ -35,9 +35,15 @@ struct _SbdfDecimal { unsigned char exponent_high_and_sign; }; -/* Utility functions for extracting strings from Python ``Union[str,bytes]`` into C */ -extern sbdf_object *_export_extract_string_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count); -extern sbdf_object *_export_extract_binary_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count); +/* Utility functions for extracting strings from Python ``Union[str,bytes]`` into C. + * vals_ptr: PyArray_DATA() of a numpy object array (array of PyObject* slots). + * inv_ptr: PyArray_DATA() of a numpy bool array (one byte per element, nonzero == null). + * Both pointers must remain valid for the duration of the call (caller holds the numpy arrays). + */ +extern sbdf_object *_export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count); +extern sbdf_object *_export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count); /* Fast string export directly from Arrow LargeUtf8 buffers: no Python str objects created. * values_buf: concatenated UTF-8 bytes from the Arrow values buffer. diff --git a/spotfire/sbdf_helpers.pxi b/spotfire/sbdf_helpers.pxi index ea719fa..c072c3b 100644 --- a/spotfire/sbdf_helpers.pxi +++ b/spotfire/sbdf_helpers.pxi @@ -21,11 +21,16 @@ cdef extern from "sbdf_helpers.h": unsigned char exponent_low unsigned char exponent_high_and_sign - # Utility functions for extracting strings from Python ``Union[str,bytes]`` into C - sbdf_c.sbdf_object* _export_extract_string_obj(object val, object invalids, Py_ssize_t start, Py_ssize_t count) \ - except NULL - sbdf_c.sbdf_object* _export_extract_binary_obj(object val, object invalids, Py_ssize_t start, Py_ssize_t count) \ - except NULL + # Utility functions for extracting strings from Python ``Union[str,bytes]`` into C. + # vals_ptr: PyArray_DATA() of a numpy object array; inv_ptr: PyArray_DATA() of a numpy bool array. + sbdf_c.sbdf_object* _export_extract_string_obj(void **vals_ptr, + const unsigned char *inv_ptr, + Py_ssize_t start, + Py_ssize_t count) except NULL + sbdf_c.sbdf_object* _export_extract_binary_obj(void **vals_ptr, + const unsigned char *inv_ptr, + Py_ssize_t start, + Py_ssize_t count) except NULL # Fast Arrow LargeUtf8 path: no Python str objects, no re-encoding sbdf_c.sbdf_object* _export_extract_string_obj_arrow(const char *values_buf, const long long *offsets, diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 2d220ec..9f096ba 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -11,9 +11,19 @@ import pandas as pd import pandas.testing as pdtest import numpy as np -import geopandas as gpd -import matplotlib.pyplot -import seaborn +try: + import geopandas as gpd # type: ignore[import-not-found] +except ImportError: + gpd = None # type: ignore[assignment] +try: + import matplotlib # type: ignore[import-not-found] + import matplotlib.pyplot +except ImportError: + matplotlib = None # type: ignore[assignment] +try: + import seaborn # type: ignore[import-not-found] +except ImportError: + seaborn = None # type: ignore[assignment] import PIL.Image from packaging import version @@ -144,6 +154,7 @@ def test_read_10001(self): self.assertEqual(dataframe.at[10000, "String"], "kiwis") self.assertEqual(dataframe.at[10000, "Binary"], b"\x7c\x7d\x7e\x7f") + @unittest.skipIf(gpd is None, "geopandas not installed") def test_read_write_geodata(self): """Test that geo-encoded data is properly converted to/from ``GeoDataFrame``.""" gdf = sbdf.import_data(utils.get_test_data_file("sbdf/NACountries.sbdf")) @@ -475,6 +486,207 @@ def test_numpy_timedelta_resolution(self): val = df2.at[1, 'x'] self.assertEqual(val, target) + def test_temporal_nulls_roundtrip(self): + """Verify that mixed-null temporal columns survive export/import with correct positions.""" + dt = datetime.datetime + d = datetime.date + t = datetime.time + td = datetime.timedelta + + cases = { + "datetime": [dt(2020, 6, 15, 12, 0, 0), None, dt(1969, 7, 20, 20, 17, 0)], + "date": [d(2020, 6, 15), None, d(1969, 7, 20)], + "time": [t(12, 0, 0), None, t(20, 17, 0)], + "timespan": [td(days=1), None, td(seconds=30)], + } + for col_name, values in cases.items(): + with self.subTest(type=col_name): + dataframe = pd.DataFrame({"x": values}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertFalse(pd.isnull(new_df.at[0, "x"]), "row 0 should not be null") + self.assertTrue(pd.isnull(new_df.at[1, "x"]), "row 1 should be null") + self.assertFalse(pd.isnull(new_df.at[2, "x"]), "row 2 should not be null") + + def test_negative_timespans(self): + """Verify that negative timedelta values round-trip correctly.""" + cases = [ + datetime.timedelta(seconds=-1), + datetime.timedelta(days=-1), + datetime.timedelta(days=-5, seconds=300), + datetime.timedelta(milliseconds=-1), + datetime.timedelta(days=-1, seconds=86399, microseconds=999000), # -1 ms + ] + dataframe = pd.DataFrame({"x": cases}) + new_df = self._roundtrip_dataframe(dataframe) + for i, expected in enumerate(cases): + with self.subTest(i=i, value=expected): + got = new_df.at[i, "x"] + # SBDF has millisecond resolution; truncate expected to ms + expected_ms = datetime.timedelta(milliseconds=expected // datetime.timedelta(milliseconds=1)) + self.assertEqual(got, expected_ms) + + def test_pre_epoch_dates(self): + """Verify that dates before the Unix epoch (1970-01-01) round-trip correctly.""" + cases = [ + datetime.date(1, 1, 1), # SBDF epoch + datetime.date(1582, 10, 4), # day before Gregorian calendar + datetime.date(1969, 12, 31), # one day before Unix epoch + datetime.date(1970, 1, 1), # Unix epoch + datetime.date(1970, 1, 2), # one day after Unix epoch + datetime.date(9999, 12, 31), # max Python date + ] + dataframe = pd.DataFrame({"x": cases}) + new_df = self._roundtrip_dataframe(dataframe) + for i, expected in enumerate(cases): + with self.subTest(date=expected): + self.assertEqual(new_df.at[i, "x"], expected) + + def test_pre_epoch_datetimes(self): + """Verify that datetimes before the Unix epoch round-trip correctly.""" + cases = [ + datetime.datetime(1, 1, 1, 0, 0, 0), + datetime.datetime(1969, 12, 31, 23, 59, 59), + datetime.datetime(1969, 12, 31, 0, 0, 0), + ] + dataframe = pd.DataFrame({"x": cases}) + new_df = self._roundtrip_dataframe(dataframe) + for i, expected in enumerate(cases): + with self.subTest(dt=expected): + self.assertEqual(new_df.at[i, "x"], expected) + + def test_time_edge_cases(self): + """Verify midnight, end-of-day, and microsecond-precision time values.""" + cases = [ + (datetime.time(0, 0, 0), datetime.time(0, 0, 0)), # midnight + (datetime.time(23, 59, 59, 999000), datetime.time(23, 59, 59, 999000)), # end of day (ms boundary) + (datetime.time(12, 30, 45, 500), datetime.time(12, 30, 45, 0)), # sub-ms truncated + (datetime.time(0, 0, 0, 1000), datetime.time(0, 0, 0, 1000)), # 1 ms exactly + ] + for input_val, expected in cases: + with self.subTest(time=input_val): + dataframe = pd.DataFrame({"x": [input_val]}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(new_df.at[0, "x"], expected) + + def test_all_null_temporal_columns(self): + """Verify that all-null columns of each temporal type export and import without error.""" + for spotfire_type, dtype in [("DateTime", "datetime64[ms]"), + ("TimeSpan", "timedelta64[ms]")]: + with self.subTest(type=spotfire_type): + dataframe = pd.DataFrame({"x": pd.array([pd.NaT, pd.NaT, pd.NaT], # type: ignore[call-overload] + dtype=dtype)}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), 3) + self.assertTrue(new_df["x"].isna().all()) + + def test_numpy_datetime_with_nulls(self): + """Verify that numpy datetime64 columns with NaT values export and import correctly.""" + values = pd.array([ + pd.NaT, + pd.Timestamp("2020-01-01"), + pd.NaT, + pd.Timestamp("1969-07-20"), + pd.NaT, + ], dtype="datetime64[ms]") + dataframe = pd.DataFrame({"x": values}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertTrue(pd.isnull(new_df.at[0, "x"])) + self.assertEqual(new_df.at[1, "x"], datetime.datetime(2020, 1, 1)) + self.assertTrue(pd.isnull(new_df.at[2, "x"])) + self.assertEqual(new_df.at[3, "x"], datetime.datetime(1969, 7, 20)) + self.assertTrue(pd.isnull(new_df.at[4, "x"])) + + def test_numpy_timedelta_with_nulls(self): + """Verify that numpy timedelta64 columns with NaT values export and import correctly.""" + values = pd.array([ # type: ignore[call-overload] + pd.NaT, + pd.Timedelta(days=1), + pd.NaT, + pd.Timedelta(seconds=-30), + pd.NaT, + ], dtype="timedelta64[ms]") + dataframe = pd.DataFrame({"x": values}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertTrue(pd.isnull(new_df.at[0, "x"])) + self.assertEqual(new_df.at[1, "x"], datetime.timedelta(days=1)) + self.assertTrue(pd.isnull(new_df.at[2, "x"])) + self.assertEqual(new_df.at[3, "x"], datetime.timedelta(seconds=-30)) + self.assertTrue(pd.isnull(new_df.at[4, "x"])) + + def test_empty_dataframe(self): + """Verify 0-row DataFrames export and import correctly for all column types. + + Exercises the zero-size array code paths that boundscheck=False leaves unchecked, + ensuring no off-by-one occurs at the loop boundary when row_count is 0. + """ + cases = [ + ("bool", pd.DataFrame({"x": pd.array([], dtype="bool")})), + ("int64", pd.DataFrame({"x": pd.array([], dtype="int64")})), + ("float64", pd.DataFrame({"x": pd.array([], dtype="float64")})), + ("datetime64[ms]", pd.DataFrame({"x": pd.array([], dtype="datetime64[ms]")})), + ("timedelta64[ms]", pd.DataFrame({"x": pd.array([], dtype="timedelta64[ms]")})), + ] + for label, dataframe in cases: + with self.subTest(dtype=label): + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), 0) + self.assertIn("x", new_df.columns) + # String requires an explicit type annotation when the column is empty (no values to infer from) + str_df = pd.DataFrame({"x": pd.Series([], dtype=object)}) + spotfire.set_spotfire_types(str_df, {"x": "String"}) + with self.subTest(dtype="string"): + new_df = self._roundtrip_dataframe(str_df) + self.assertEqual(len(new_df), 0) + self.assertIn("x", new_df.columns) + + def test_multichunk_export(self): + """Verify exports spanning multiple SBDF row slices produce correct values. + + The default slice size is ``100_000 // num_columns`` rows, so a 100_001-row + single-column DataFrame forces a second slice (start=100_000, count=1). + This exercises _export_vt_time's direct ``[start+i]`` indexing and the + _export_get_offset_ptr pointer arithmetic for precomputed int64 paths, + both of which are unchecked under boundscheck=False. + """ + n = 100_001 + + # time: _export_vt_time accesses context.values_array[start + i] directly + times = [datetime.time(0, 0, 0)] * n + times[-1] = datetime.time(23, 59, 58) + dataframe = pd.DataFrame({"t": times}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), n) + self.assertEqual(new_df.at[0, "t"], datetime.time(0, 0, 0)) + self.assertEqual(new_df.at[n - 1, "t"], datetime.time(23, 59, 58)) + + # date: precomputed int64 via np.asarray, exported via _export_get_offset_ptr + dates = [datetime.date(2000, 1, 1)] * n + dates[-1] = datetime.date(2001, 9, 11) + dataframe = pd.DataFrame({"d": dates}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), n) + self.assertEqual(new_df.at[0, "d"], datetime.date(2000, 1, 1)) + self.assertEqual(new_df.at[n - 1, "d"], datetime.date(2001, 9, 11)) + + # datetime64[ms]: precomputed int64, exported via _export_get_offset_ptr + dts = pd.array([pd.Timestamp("2000-01-01")] * n, dtype="datetime64[ms]") + dts[-1] = pd.Timestamp("1969-07-20 20:17:40") + dataframe = pd.DataFrame({"dt": dts}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), n) + self.assertEqual(new_df.at[0, "dt"], datetime.datetime(2000, 1, 1)) + self.assertEqual(new_df.at[n - 1, "dt"], datetime.datetime(1969, 7, 20, 20, 17, 40)) + + # timedelta64[ms]: precomputed int64, exported via _export_get_offset_ptr + tds = pd.array([pd.Timedelta(0)] * n, dtype="timedelta64[ms]") + tds[-1] = pd.Timedelta(seconds=-1) + dataframe = pd.DataFrame({"td": tds}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), n) + self.assertEqual(new_df.at[0, "td"], datetime.timedelta(0)) + self.assertEqual(new_df.at[n - 1, "td"], datetime.timedelta(seconds=-1)) + + @unittest.skipIf(matplotlib is None, "matplotlib not installed") def test_image_matplot(self): """Verify Matplotlib figures export properly.""" matplotlib.pyplot.clf() @@ -487,6 +699,7 @@ def test_image_matplot(self): else: self.fail(f"Expected PNG bytes, got {type(image)}: {image!r}") + @unittest.skipIf(seaborn is None, "seaborn not installed") def test_image_seaborn(self): """Verify Seaborn grids export properly.""" matplotlib.pyplot.clf() diff --git a/test_requirements_no_polars.txt b/test_requirements_no_polars.txt deleted file mode 100644 index 73ab30d..0000000 --- a/test_requirements_no_polars.txt +++ /dev/null @@ -1,6 +0,0 @@ -html-testRunner -geopandas -matplotlib -pillow -seaborn -shapely \ No newline at end of file