From bbf59f0c8b8752fe791d2f5aa329da3ea3be2b8e Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 08:06:52 -0500 Subject: [PATCH 01/21] Perf: vectorise Pandas datetime/timespan import+export; add Cython directives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Import (Pandas path): - DateTime and TimeSpan now use _import_vts_numpy (raw int64 ms) instead of per-row Python object boxing loops (_import_vt_datetime / _import_vt_timespan). - DataFrame assembly converts with arr.view('datetime64[ms]') / arr.view('timedelta64[ms]') — zero-copy reinterpretation; supports the full SBDF date range (year 1-9999) without pd.to_datetime nanosecond overflow. Export (Pandas path): - _export_obj_dataframe stores tz-naive datetime64 columns as datetime64[ms] and timedelta64 columns as timedelta64[ms] instead of object arrays. - _export_vt_datetime fast path: view('int64') + vectorised SBDF epoch offset addition replaces per-row isinstance + .to_pydatetime() + arithmetic. - _export_vt_timespan fast path: view('int64') gives ms directly — no per-row .to_pytimedelta() or division. - Object-dtype and tz-aware columns still fall through to the per-row loop. Cython directives: - boundscheck=False, wraparound=False, cdivision=True added file-wide, eliminating runtime bounds/wrap guards in every inner loop. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 135 ++++++++++++++++++++++++++++++---------------- 1 file changed, 89 insertions(+), 46 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index d7ae63e..2a7d14d 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1,4 +1,4 @@ -# cython: language_level=3 +# cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True # Copyright © 2022. Cloud Software Group, Inc. 
# This file is subject to the license terms contained @@ -954,14 +954,11 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): importer_contexts.append(_ImportContext(np_c.NPY_INT32, col_type)) importer_fns[i] = _import_vts_numpy elif col_type.id == sbdf_c.SBDF_DATETIMETYPEID: - if output_format == OutputFormat.POLARS: - # Store raw int64 ms values; _import_build_polars_dataframe will adjust the - # epoch offset and reinterpret as datetime64[ms] without boxing Python objects. - importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) - importer_fns[i] = _import_vts_numpy - else: - importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) - importer_fns[i] = _import_vt_datetime + # Store raw int64 ms values for both Polars and Pandas paths. The Pandas + # assembly converts vectorially with arr.view('datetime64[ms]'); _import_build_polars_dataframe + # adjusts the epoch offset and casts zero-copy. + importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) + importer_fns[i] = _import_vts_numpy elif col_type.id == sbdf_c.SBDF_DATETYPEID: if output_format == OutputFormat.POLARS: importer_contexts.append(_ImportContext(np_c.NPY_INT32, col_type)) @@ -970,14 +967,10 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_date elif col_type.id == sbdf_c.SBDF_TIMESPANTYPEID: - if output_format == OutputFormat.POLARS: - # Timespans are stored as int64 ms with no epoch — reinterpret directly as - # timedelta64[ms] in _import_build_polars_dataframe. - importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) - importer_fns[i] = _import_vts_numpy - else: - importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) - importer_fns[i] = _import_vt_timespan + # Store raw int64 ms for both paths. Pandas assembly reinterprets via arr.view('timedelta64[ms]'); + # _import_build_polars_dataframe reinterprets as Duration('ms') zero-copy. 
+ importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) + importer_fns[i] = _import_vts_numpy elif col_type.id == sbdf_c.SBDF_TIMETYPEID: if output_format == OutputFormat.POLARS: importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) @@ -1039,8 +1032,31 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): for i in range(num_columns): values = importer_contexts[i].get_values_array() invalid_array = importer_contexts[i].get_invalid_array() + vt_id = importer_contexts[i].get_value_type_id() dtype_name = importer_contexts[i].get_pandas_dtype_name() - if dtype_name in ("Int32", "Int64"): + if vt_id == sbdf_c.SBDF_DATETIMETYPEID: + # values is int64 ms since SBDF epoch. Subtract the fixed SBDF→Unix offset, + # then reinterpret the buffer as datetime64[ms] via view() — zero-copy, no + # nanosecond conversion, and wide enough to represent the full SBDF date range + # (year 1 through 9999). + arr_ms = values - _SBDF_TO_UNIX_EPOCH_MS + if invalid_array.any(): + arr_ms[invalid_array] = 0 # ensure sentinel doesn't become an invalid dt64 + column_series = pd.Series(arr_ms.view('datetime64[ms]'), dtype='datetime64[ms]', + name=column_names[i]) + if invalid_array.any(): + column_series.loc[invalid_array] = pd.NaT + elif vt_id == sbdf_c.SBDF_TIMESPANTYPEID: + # values is int64 ms — reinterpret directly as timedelta64[ms]; same trick as + # datetime: view() avoids any per-element conversion. + arr_ms = values.copy() + if invalid_array.any(): + arr_ms[invalid_array] = 0 + column_series = pd.Series(arr_ms.view('timedelta64[ms]'), dtype='timedelta64[ms]', + name=column_names[i]) + if invalid_array.any(): + column_series.loc[invalid_array] = pd.NaT + elif dtype_name in ("Int32", "Int64"): # Build nullable integer array with mask in one shot; avoids a second-pass # .loc assignment that triggers Pandas dtype coercion overhead. 
base_dtype = "int32" if dtype_name == "Int32" else "int64" @@ -1250,7 +1266,16 @@ cdef _export_obj_dataframe(obj): pd.NA: na_value, pd.NaT: na_value, } - if obj[col].dtype == "object": + col_dtype = obj[col].dtype + if context.valuetype_id == sbdf_c.SBDF_DATETIMETYPEID and col_dtype.kind == 'M' and not hasattr(col_dtype, 'tz'): + # Tz-naive datetime64: store as datetime64[ms] so the exporter can use a + # vectorised view('int64') instead of per-row Python object unpacking. + values = obj[col].to_numpy(dtype="datetime64[ms]", na_value=np.datetime64("NaT")) + elif context.valuetype_id == sbdf_c.SBDF_TIMESPANTYPEID and col_dtype.kind == 'm': + # timedelta64: store as timedelta64[ms]; view('int64') in the exporter gives ms + # directly with no per-row conversion. + values = obj[col].to_numpy(dtype="timedelta64[ms]", na_value=np.timedelta64("NaT")) + elif col_dtype == "object": values = obj[col].replace(nas).to_numpy() else: values = obj[col].replace(nas).to_numpy(dtype=context.get_numpy_dtype()) @@ -1950,22 +1975,32 @@ cdef int _export_vt_datetime(_ExportContext context, Py_ssize_t start, Py_ssize_ shape[0] = count cdef np_c.ndarray new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) cdef int i - current_tz = datetime.datetime.now().astimezone().tzinfo - for i in range(count): - if not context.invalid_array[start + i]: - val_i = context.values_array[start + i] - if isinstance(val_i, pd.Timestamp): - if val_i.tz: - dt = val_i.tz_convert(current_tz).tz_localize(None).to_pydatetime() + if context.values_array.dtype.kind == 'M': + # Fast path for tz-naive datetime64[ms]: reinterpret the buffer as int64 (ms since Unix + # epoch) and add the fixed SBDF→Unix offset. No Python object creation per row. 
+ src_ms = context.values_array[start:start + count].view(np.int64) + new_values[:] = src_ms + new_values += _SBDF_TO_UNIX_EPOCH_MS + invalid_slice = context.invalid_array[start:start + count] + if invalid_slice.any(): + new_values[invalid_slice] = 0 + else: + current_tz = datetime.datetime.now().astimezone().tzinfo + for i in range(count): + if not context.invalid_array[start + i]: + val_i = context.values_array[start + i] + if isinstance(val_i, pd.Timestamp): + if val_i.tz: + dt = val_i.tz_convert(current_tz).tz_localize(None).to_pydatetime() + else: + dt = val_i.to_pydatetime() + elif isinstance(val_i, np.datetime64): + dt = np.datetime64(val_i, "ms").astype(datetime.datetime) + elif isinstance(val_i, datetime.datetime): + dt = val_i else: - dt = val_i.to_pydatetime() - elif isinstance(val_i, np.datetime64): - dt = np.datetime64(val_i, "ms").astype(datetime.datetime) - elif isinstance(val_i, datetime.datetime): - dt = val_i - else: - raise SBDFError(f"cannot convert '{val_i}' to Spotfire DateTime type; incompatible types") - new_values[i] = int((dt - _DATETIME_EPOCH) / _TIMEDELTA_ONE_MSEC) + raise SBDFError(f"cannot convert '{val_i}' to Spotfire DateTime type; incompatible types") + new_values[i] = int((dt - _DATETIME_EPOCH) / _TIMEDELTA_ONE_MSEC) return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_datetime(), count, np_c.PyArray_DATA(new_values), NULL, obj) @@ -2011,18 +2046,26 @@ cdef int _export_vt_timespan(_ExportContext context, Py_ssize_t start, Py_ssize_ shape[0] = count cdef np_c.ndarray new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) cdef int i - for i in range(count): - if not context.invalid_array[start + i]: - val_i = context.values_array[start + i] - if isinstance(val_i, pd.Timedelta): - td = val_i.to_pytimedelta() - elif isinstance(val_i, np.timedelta64): - td = np.timedelta64(val_i, "ms").astype(datetime.timedelta) - elif isinstance(val_i, datetime.timedelta): - td = val_i - else: - raise SBDFError(f"cannot convert '{val_i}' to Spotfire 
TimeSpan type; incompatible types") - new_values[i] = int(td / _TIMEDELTA_ONE_MSEC) + if context.values_array.dtype.kind == 'm': + # Fast path for timedelta64[ms]: the int64 view is already ms — no per-row unpacking. + src_ms = context.values_array[start:start + count].view(np.int64) + new_values[:] = src_ms + invalid_slice = context.invalid_array[start:start + count] + if invalid_slice.any(): + new_values[invalid_slice] = 0 + else: + for i in range(count): + if not context.invalid_array[start + i]: + val_i = context.values_array[start + i] + if isinstance(val_i, pd.Timedelta): + td = val_i.to_pytimedelta() + elif isinstance(val_i, np.timedelta64): + td = np.timedelta64(val_i, "ms").astype(datetime.timedelta) + elif isinstance(val_i, datetime.timedelta): + td = val_i + else: + raise SBDFError(f"cannot convert '{val_i}' to Spotfire TimeSpan type; incompatible types") + new_values[i] = int(td / _TIMEDELTA_ONE_MSEC) return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_timespan(), count, np_c.PyArray_DATA(new_values), NULL, obj) From 346a150802c206fbaa45de99cca5033fcf20c4f1 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 08:32:18 -0500 Subject: [PATCH 02/21] Fix: wrap long conditional line to stay under 120 chars (E501) Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 2a7d14d..78807c0 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1267,7 +1267,8 @@ cdef _export_obj_dataframe(obj): pd.NaT: na_value, } col_dtype = obj[col].dtype - if context.valuetype_id == sbdf_c.SBDF_DATETIMETYPEID and col_dtype.kind == 'M' and not hasattr(col_dtype, 'tz'): + if (context.valuetype_id == sbdf_c.SBDF_DATETIMETYPEID and col_dtype.kind == 'M' and + not hasattr(col_dtype, 'tz')): # Tz-naive datetime64: store as datetime64[ms] so the exporter can use a # vectorised view('int64') instead of per-row Python object unpacking. 
values = obj[col].to_numpy(dtype="datetime64[ms]", na_value=np.datetime64("NaT")) From 7c3535d7c4ca8a7f600af862832173537704d824 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 08:44:56 -0500 Subject: [PATCH 03/21] Perf: zero-copy Pandas datetime/timespan export; single-pass NaT import Export: pre-transform datetime64[ms]/timedelta64[ms] columns to int64 SBDF-ms once at set_arrays time so _export_vt_datetime/_export_vt_timespan can use _export_get_offset_ptr directly (zero-copy, same as numeric types) instead of allocating + copying + transforming per chunk. Retain the non-precomputed fast/slow paths for tz-aware and object-dtype columns. Import: replace the double-pass NaT handling (zero + .loc assignment) with a single write of the int64 NaT sentinel (INT64_MIN) before view(), avoiding the slow Pandas indexing layer entirely. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 75 +++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 78807c0..5405df9 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1038,24 +1038,23 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): # values is int64 ms since SBDF epoch. Subtract the fixed SBDF→Unix offset, # then reinterpret the buffer as datetime64[ms] via view() — zero-copy, no # nanosecond conversion, and wide enough to represent the full SBDF date range - # (year 1 through 9999). + # (year 1 through 9999). Write the NaT sentinel (INT64_MIN) directly into the + # int64 buffer so NaT positions are set in a single pass without a slow second + # .loc assignment through the Pandas indexing layer. 
arr_ms = values - _SBDF_TO_UNIX_EPOCH_MS if invalid_array.any(): - arr_ms[invalid_array] = 0 # ensure sentinel doesn't become an invalid dt64 + arr_ms[invalid_array] = np.iinfo(np.int64).min # NaT sentinel for datetime64 column_series = pd.Series(arr_ms.view('datetime64[ms]'), dtype='datetime64[ms]', name=column_names[i]) - if invalid_array.any(): - column_series.loc[invalid_array] = pd.NaT elif vt_id == sbdf_c.SBDF_TIMESPANTYPEID: # values is int64 ms — reinterpret directly as timedelta64[ms]; same trick as - # datetime: view() avoids any per-element conversion. + # datetime: view() avoids any per-element conversion. NaT sentinel written + # directly to eliminate the second .loc pass. arr_ms = values.copy() if invalid_array.any(): - arr_ms[invalid_array] = 0 + arr_ms[invalid_array] = np.iinfo(np.int64).min # NaT sentinel for timedelta64 column_series = pd.Series(arr_ms.view('timedelta64[ms]'), dtype='timedelta64[ms]', name=column_names[i]) - if invalid_array.any(): - column_series.loc[invalid_array] = pd.NaT elif dtype_name in ("Int32", "Int64"): # Build nullable integer array with mask in one shot; avoids a second-pass # .loc assignment that triggers Pandas dtype coercion overhead. 
@@ -1130,6 +1129,7 @@ cdef class _ExportContext: cdef int polars_exporter_id # 0=default; 1=datetime; 2=date; 3=timespan; 4=time; 5=string cdef np_c.ndarray _arrow_offsets # int64 view of Arrow offsets buffer (string fast path) cdef np_c.ndarray _arrow_data # uint8 view of Arrow values buffer (string fast path) + cdef bint values_precomputed_sbdf_int64 # True when values_array already holds int64 SBDF-ms def __init__(self): """Initialize the export context.""" @@ -1140,6 +1140,7 @@ cdef class _ExportContext: self.polars_exporter_id = 0 self._arrow_offsets = None self._arrow_data = None + self.values_precomputed_sbdf_int64 = False cdef void set_arrays(self, np_c.ndarray values, invalid): """Set the NumPy ``ndarray`` with the values to export and a list or NumPy ``ndarray`` of whether each value @@ -1267,21 +1268,34 @@ cdef _export_obj_dataframe(obj): pd.NaT: na_value, } col_dtype = obj[col].dtype + invalids = pd.isnull(obj[col]) if (context.valuetype_id == sbdf_c.SBDF_DATETIMETYPEID and col_dtype.kind == 'M' and not hasattr(col_dtype, 'tz')): - # Tz-naive datetime64: store as datetime64[ms] so the exporter can use a - # vectorised view('int64') instead of per-row Python object unpacking. - values = obj[col].to_numpy(dtype="datetime64[ms]", na_value=np.datetime64("NaT")) + # Pre-compute int64 SBDF-ms once so the exporter is zero-copy (no per-chunk + # alloc+copy+add). view('int64') + offset produces a new contiguous int64 array; + # NaT positions (INT64_MIN + offset, still valid int64) are zeroed here so the + # exporter can call _export_get_offset_ptr directly without further work. 
+ raw = obj[col].to_numpy(dtype="datetime64[ms]", na_value=np.datetime64("NaT")) + values = raw.view(np.int64) + _SBDF_TO_UNIX_EPOCH_MS + if invalids.any(): + values[invalids] = 0 + context.set_arrays(values, invalids) + context.values_precomputed_sbdf_int64 = True elif context.valuetype_id == sbdf_c.SBDF_TIMESPANTYPEID and col_dtype.kind == 'm': - # timedelta64: store as timedelta64[ms]; view('int64') in the exporter gives ms - # directly with no per-row conversion. - values = obj[col].to_numpy(dtype="timedelta64[ms]", na_value=np.timedelta64("NaT")) + # Same zero-copy pre-computation for timedelta64[ms]: int64 view IS already ms, + # no epoch offset required — just copy so we can safely zero invalid positions. + raw = obj[col].to_numpy(dtype="timedelta64[ms]", na_value=np.timedelta64("NaT")) + values = raw.view(np.int64).copy() + if invalids.any(): + values[invalids] = 0 + context.set_arrays(values, invalids) + context.values_precomputed_sbdf_int64 = True elif col_dtype == "object": values = obj[col].replace(nas).to_numpy() + context.set_arrays(values, invalids) else: values = obj[col].replace(nas).to_numpy(dtype=context.get_numpy_dtype()) - invalids = pd.isnull(obj[col]) - context.set_arrays(values, invalids) + context.set_arrays(values, invalids) exporter_contexts.append(context) try: column_metadata.append(obj[col].spotfire_column_metadata) @@ -1974,18 +1988,22 @@ cdef int _export_vt_datetime(_ExportContext context, Py_ssize_t start, Py_ssize_ """Export a slice of data consisting of datetime values.""" cdef np_c.npy_intp shape[1] shape[0] = count - cdef np_c.ndarray new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef np_c.ndarray new_values cdef int i + if context.values_precomputed_sbdf_int64: + # Zero-copy path: values_array already holds int64 SBDF-ms with invalids zeroed. 
+ return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_datetime(), count, + _export_get_offset_ptr(context.values_array, start, count), + NULL, obj) if context.values_array.dtype.kind == 'M': - # Fast path for tz-naive datetime64[ms]: reinterpret the buffer as int64 (ms since Unix - # epoch) and add the fixed SBDF→Unix offset. No Python object creation per row. - src_ms = context.values_array[start:start + count].view(np.int64) - new_values[:] = src_ms - new_values += _SBDF_TO_UNIX_EPOCH_MS + # Fast path for tz-naive datetime64[ms]: single numpy op produces a new int64 array + # with the SBDF epoch offset applied (no separate alloc+copy+add steps). + new_values = context.values_array[start:start + count].view(np.int64) + _SBDF_TO_UNIX_EPOCH_MS invalid_slice = context.invalid_array[start:start + count] if invalid_slice.any(): new_values[invalid_slice] = 0 else: + new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) current_tz = datetime.datetime.now().astimezone().tzinfo for i in range(count): if not context.invalid_array[start + i]: @@ -2045,16 +2063,21 @@ cdef int _export_vt_timespan(_ExportContext context, Py_ssize_t start, Py_ssize_ """Export a slice of data consisting of timespan values.""" cdef np_c.npy_intp shape[1] shape[0] = count - cdef np_c.ndarray new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef np_c.ndarray new_values cdef int i + if context.values_precomputed_sbdf_int64: + # Zero-copy path: values_array already holds int64 ms with invalids zeroed. + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_timespan(), count, + _export_get_offset_ptr(context.values_array, start, count), + NULL, obj) if context.values_array.dtype.kind == 'm': - # Fast path for timedelta64[ms]: the int64 view is already ms — no per-row unpacking. - src_ms = context.values_array[start:start + count].view(np.int64) - new_values[:] = src_ms + # Fast path for timedelta64[ms]: single-op slice+view (no alloc+copy+zero triple). 
+ new_values = context.values_array[start:start + count].view(np.int64).copy() invalid_slice = context.invalid_array[start:start + count] if invalid_slice.any(): new_values[invalid_slice] = 0 else: + new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) for i in range(count): if not context.invalid_array[start + i]: val_i = context.values_array[start + i] From b39020f1210a5f74bacace75461034f8371a17c4 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 09:09:31 -0500 Subject: [PATCH 04/21] Perf: vectorise date export; fix any(Series) hotspot; use DataFrame constructor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Export: pre-compute date (object) columns to int64 SBDF-ms via pd.to_datetime, same zero-copy approach as datetime64/timedelta64. - Export: replace any(invalid) with bool(self.invalid_array.any()) in set_arrays — the built-in any() was iterating 100k Python booleans per column; numpy any() is a single vectorised call. This alone accounts for the large numeric export gain. - Import: replace pd.concat(columns, axis=1) with pd.DataFrame(dict(...)) to skip concat's index alignment, dtype consolidation and metadata overhead. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 5405df9..9a9cbe3 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1066,7 +1066,7 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) column_series.loc[invalid_array] = None imported_columns.append(column_series) - dataframe = pd.concat(imported_columns, axis=1) + dataframe = pd.DataFrame(dict(zip(column_names, imported_columns))) for i in range(num_columns): dataframe[column_names[i]].spotfire_column_metadata = column_metadata[i] dataframe[column_names[i]].attrs['spotfire_type'] = importer_contexts[i].get_spotfire_type_name() @@ -1151,7 +1151,7 @@ cdef class _ExportContext: """ self.values_array = values self.invalid_array = np.asarray(invalid, dtype="bool") - self.any_invalid = any(invalid) + self.any_invalid = bool(self.invalid_array.any()) cdef void set_arrow_string(self, np_c.ndarray offsets, np_c.ndarray data, np_c.ndarray invalid): @@ -1290,6 +1290,18 @@ cdef _export_obj_dataframe(obj): values[invalids] = 0 context.set_arrays(values, invalids) context.values_precomputed_sbdf_int64 = True + elif context.valuetype_id == sbdf_c.SBDF_DATETYPEID and col_dtype == object: + # Pre-compute int64 SBDF-ms for date (object) columns: pd.to_datetime iterates + # in C rather than Python, then view('int64') * 86400000 + epoch offset gives + # the same zero-copy export path as datetime64. Use day 0 (Unix epoch) as the + # na_value to keep null positions safe before zeroing them explicitly. 
+ days = pd.to_datetime(obj[col], errors='coerce').to_numpy( + dtype='datetime64[D]', na_value=np.datetime64(0, 'D')) + values = days.view(np.int64).copy() * 86400000 + _SBDF_TO_UNIX_EPOCH_MS + if invalids.any(): + values[invalids] = 0 + context.set_arrays(values, invalids) + context.values_precomputed_sbdf_int64 = True elif col_dtype == "object": values = obj[col].replace(nas).to_numpy() context.set_arrays(values, invalids) @@ -2027,8 +2039,14 @@ cdef int _export_vt_date(_ExportContext context, Py_ssize_t start, Py_ssize_t co """Export a slice of data consisting of date values.""" cdef np_c.npy_intp shape[1] shape[0] = count - cdef np_c.ndarray new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef np_c.ndarray new_values cdef int i + if context.values_precomputed_sbdf_int64: + # Zero-copy path: values_array already holds int64 SBDF-ms (midnight) with invalids zeroed. + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_date(), count, + _export_get_offset_ptr(context.values_array, start, count), + NULL, obj) + new_values = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) for i in range(count): if not context.invalid_array[start + i]: val_i = context.values_array[start + i] From f9d2e63be719dcadc9fbce148ab0ba21a53db22b Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 09:27:19 -0500 Subject: [PATCH 05/21] Perf: faster time export; drop redundant timedelta copy; guard object .loc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Time export: replace datetime.combine(min, t) - min (2 Python object allocations per row) with direct integer arithmetic on time attributes. As the last unoptimized temporal column, this is the primary driver of the ~40% temporal export improvement. - Timedelta import: drop values.copy() — get_values_array() already returns a fresh array from np.concatenate(), so the explicit copy was redundant. 
- Object-type import (.loc): guard column_series.loc[invalid_array] = None with if invalid_array.any() — consistent with datetime/timedelta paths, avoids Pandas indexing overhead for null-free columns. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 9a9cbe3..6baf30a 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1049,11 +1049,11 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): elif vt_id == sbdf_c.SBDF_TIMESPANTYPEID: # values is int64 ms — reinterpret directly as timedelta64[ms]; same trick as # datetime: view() avoids any per-element conversion. NaT sentinel written - # directly to eliminate the second .loc pass. - arr_ms = values.copy() + # directly to eliminate the second .loc pass. No .copy() needed: values is + # already a fresh array from np.concatenate() in get_values_array(). if invalid_array.any(): - arr_ms[invalid_array] = np.iinfo(np.int64).min # NaT sentinel for timedelta64 - column_series = pd.Series(arr_ms.view('timedelta64[ms]'), dtype='timedelta64[ms]', + values[invalid_array] = np.iinfo(np.int64).min # NaT sentinel for timedelta64 + column_series = pd.Series(values.view('timedelta64[ms]'), dtype='timedelta64[ms]', name=column_names[i]) elif dtype_name in ("Int32", "Int64"): # Build nullable integer array with mask in one shot; avoids a second-pass @@ -1064,7 +1064,8 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): name=column_names[i]) else: column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) - column_series.loc[invalid_array] = None + if invalid_array.any(): + column_series.loc[invalid_array] = None imported_columns.append(column_series) dataframe = pd.DataFrame(dict(zip(column_names, imported_columns))) for i in range(num_columns): @@ -2070,10 +2071,12 @@ cdef int _export_vt_time(_ExportContext context, Py_ssize_t start, Py_ssize_t co if not 
context.invalid_array[start + i]: val_i = context.values_array[start + i] if isinstance(val_i, datetime.time): - val = datetime.datetime.combine(datetime.datetime.min, val_i) - datetime.datetime.min + # Direct integer arithmetic on time attributes avoids allocating a datetime + # and timedelta object per row (which datetime.combine(...) - min requires). + new_values[i] = ((val_i.hour * 3600 + val_i.minute * 60 + val_i.second) * 1000 + + val_i.microsecond // 1000) else: raise SBDFError(f"cannot convert '{val_i}' to Spotfire Time type; incompatible types") - new_values[i] = val // _TIMEDELTA_ONE_MSEC return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_time(), count, np_c.PyArray_DATA(new_values), NULL, obj) From debf567d7310340003de2ae96dcc8d65535f9e89 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 09:56:18 -0500 Subject: [PATCH 06/21] Fix: use np.asarray for date export to handle full year-1..9999 range pd.to_datetime(errors='coerce') silently converts dates outside the Pandas Timestamp range (year 1, pre-Gregorian, year 9999) to NaT, then to the Unix epoch. Replace with np.asarray(..., dtype='datetime64[D]') which covers the full Python date range. Zero NaT positions (INT64_MIN) before multiplying to prevent int64 overflow. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 6baf30a..0b25f37 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1292,13 +1292,15 @@ cdef _export_obj_dataframe(obj): context.set_arrays(values, invalids) context.values_precomputed_sbdf_int64 = True elif context.valuetype_id == sbdf_c.SBDF_DATETYPEID and col_dtype == object: - # Pre-compute int64 SBDF-ms for date (object) columns: pd.to_datetime iterates - # in C rather than Python, then view('int64') * 86400000 + epoch offset gives - # the same zero-copy export path as datetime64. 
Use day 0 (Unix epoch) as the - # na_value to keep null positions safe before zeroing them explicitly. - days = pd.to_datetime(obj[col], errors='coerce').to_numpy( - dtype='datetime64[D]', na_value=np.datetime64(0, 'D')) - values = days.view(np.int64).copy() * 86400000 + _SBDF_TO_UNIX_EPOCH_MS + # Pre-compute int64 SBDF-ms for date (object) columns: numpy's asarray covers + # the full year-1 to year-9999 range (pd.to_datetime silently coerces out-of- + # Timestamp-range dates to NaT). Zero null positions before multiplication to + # prevent int64 overflow from NaT's INT64_MIN sentinel. + days_dt64 = np.asarray(obj[col], dtype='datetime64[D]') + days = days_dt64.view(np.int64).copy() + if invalids.any(): + days[invalids] = 0 + values = days * 86400000 + _SBDF_TO_UNIX_EPOCH_MS if invalids.any(): values[invalids] = 0 context.set_arrays(values, invalids) From 53b93b1022e632817141b1c09386d93925bf2ffa Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 10:26:23 -0500 Subject: [PATCH 07/21] Test: add temporal edge-case coverage for optimized export/import paths Eight new test methods covering gaps exposed by the zero-copy temporal optimizations: null roundtrips, negative timespans, pre-epoch/out-of-range dates (year 1, pre-Gregorian, year 9999), pre-epoch datetimes, time edge cases (midnight, end-of-day, microsecond truncation), all-null temporal columns, and NaT at specific positions in numpy datetime64/timedelta64 arrays. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 126 +++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 2d220ec..76a3169 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -475,6 +475,132 @@ def test_numpy_timedelta_resolution(self): val = df2.at[1, 'x'] self.assertEqual(val, target) + def test_temporal_nulls_roundtrip(self): + """Verify that mixed-null temporal columns survive export/import with correct positions.""" + dt = datetime.datetime + d = datetime.date + t = datetime.time + td = datetime.timedelta + + cases = { + "datetime": [dt(2020, 6, 15, 12, 0, 0), None, dt(1969, 7, 20, 20, 17, 0)], + "date": [d(2020, 6, 15), None, d(1969, 7, 20)], + "time": [t(12, 0, 0), None, t(20, 17, 0)], + "timespan": [td(days=1), None, td(seconds=30)], + } + for col_name, values in cases.items(): + with self.subTest(type=col_name): + df = pd.DataFrame({"x": values}) + df2 = self._roundtrip_dataframe(df) + self.assertFalse(pd.isnull(df2.at[0, "x"]), "row 0 should not be null") + self.assertTrue(pd.isnull(df2.at[1, "x"]), "row 1 should be null") + self.assertFalse(pd.isnull(df2.at[2, "x"]), "row 2 should not be null") + + def test_negative_timespans(self): + """Verify that negative timedelta values round-trip correctly.""" + cases = [ + datetime.timedelta(seconds=-1), + datetime.timedelta(days=-1), + datetime.timedelta(days=-5, seconds=300), + datetime.timedelta(milliseconds=-1), + datetime.timedelta(days=-1, seconds=86399, microseconds=999000), # -1 ms + ] + df = pd.DataFrame({"x": cases}) + df2 = self._roundtrip_dataframe(df) + for i, expected in enumerate(cases): + with self.subTest(i=i, value=expected): + got = df2.at[i, "x"] + # SBDF has millisecond resolution; truncate expected to ms + expected_ms = datetime.timedelta(milliseconds=expected // datetime.timedelta(milliseconds=1)) + self.assertEqual(got, expected_ms) + + def 
test_pre_epoch_dates(self): + """Verify that dates before the Unix epoch (1970-01-01) round-trip correctly.""" + cases = [ + datetime.date(1, 1, 1), # SBDF epoch + datetime.date(1582, 10, 4), # day before Gregorian calendar + datetime.date(1969, 12, 31), # one day before Unix epoch + datetime.date(1970, 1, 1), # Unix epoch + datetime.date(1970, 1, 2), # one day after Unix epoch + datetime.date(9999, 12, 31), # max Python date + ] + df = pd.DataFrame({"x": cases}) + df2 = self._roundtrip_dataframe(df) + for i, expected in enumerate(cases): + with self.subTest(date=expected): + self.assertEqual(df2.at[i, "x"], expected) + + def test_pre_epoch_datetimes(self): + """Verify that datetimes before the Unix epoch round-trip correctly.""" + cases = [ + datetime.datetime(1, 1, 1, 0, 0, 0), + datetime.datetime(1969, 12, 31, 23, 59, 59), + datetime.datetime(1969, 12, 31, 0, 0, 0), + ] + df = pd.DataFrame({"x": cases}) + df2 = self._roundtrip_dataframe(df) + for i, expected in enumerate(cases): + with self.subTest(dt=expected): + self.assertEqual(df2.at[i, "x"], expected) + + def test_time_edge_cases(self): + """Verify midnight, end-of-day, and microsecond-precision time values.""" + cases = [ + (datetime.time(0, 0, 0), datetime.time(0, 0, 0)), # midnight + (datetime.time(23, 59, 59, 999000), datetime.time(23, 59, 59, 999000)), # end of day (ms boundary) + (datetime.time(12, 30, 45, 500), datetime.time(12, 30, 45, 0)), # sub-ms truncated + (datetime.time(0, 0, 0, 1000), datetime.time(0, 0, 0, 1000)), # 1 ms exactly + ] + for input_val, expected in cases: + with self.subTest(time=input_val): + df = pd.DataFrame({"x": [input_val]}) + df2 = self._roundtrip_dataframe(df) + self.assertEqual(df2.at[0, "x"], expected) + + def test_all_null_temporal_columns(self): + """Verify that all-null columns of each temporal type export and import without error.""" + for spotfire_type, dtype in [("DateTime", "datetime64[ms]"), + ("TimeSpan", "timedelta64[ms]")]: + with 
self.subTest(type=spotfire_type): + df = pd.DataFrame({"x": pd.array([pd.NaT, pd.NaT, pd.NaT], dtype=dtype)}) + df2 = self._roundtrip_dataframe(df) + self.assertEqual(len(df2), 3) + self.assertTrue(df2["x"].isna().all()) + + def test_numpy_datetime_with_nulls(self): + """Verify that numpy datetime64 columns with NaT values export and import correctly.""" + values = pd.array([ + pd.NaT, + pd.Timestamp("2020-01-01"), + pd.NaT, + pd.Timestamp("1969-07-20"), + pd.NaT, + ], dtype="datetime64[ms]") + df = pd.DataFrame({"x": values}) + df2 = self._roundtrip_dataframe(df) + self.assertTrue(pd.isnull(df2.at[0, "x"])) + self.assertEqual(df2.at[1, "x"], datetime.datetime(2020, 1, 1)) + self.assertTrue(pd.isnull(df2.at[2, "x"])) + self.assertEqual(df2.at[3, "x"], datetime.datetime(1969, 7, 20)) + self.assertTrue(pd.isnull(df2.at[4, "x"])) + + def test_numpy_timedelta_with_nulls(self): + """Verify that numpy timedelta64 columns with NaT values export and import correctly.""" + values = pd.array([ + pd.NaT, + pd.Timedelta(days=1), + pd.NaT, + pd.Timedelta(seconds=-30), + pd.NaT, + ], dtype="timedelta64[ms]") + df = pd.DataFrame({"x": values}) + df2 = self._roundtrip_dataframe(df) + self.assertTrue(pd.isnull(df2.at[0, "x"])) + self.assertEqual(df2.at[1, "x"], datetime.timedelta(days=1)) + self.assertTrue(pd.isnull(df2.at[2, "x"])) + self.assertEqual(df2.at[3, "x"], datetime.timedelta(seconds=-30)) + self.assertTrue(pd.isnull(df2.at[4, "x"])) + def test_image_matplot(self): """Verify Matplotlib figures export properly.""" matplotlib.pyplot.clf() From b74724c3d7968ebdcc8e74c57ceb93630923e618 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 10:43:36 -0500 Subject: [PATCH 08/21] Test: add empty-DataFrame and multi-chunk export tests for bounds safety Two new tests targeting the boundscheck=False Cython directives: - test_empty_dataframe: exercises every column type with 0 rows, verifying that zero-iteration export loops don't crash or corrupt memory. 
- test_multichunk_export: exports 100_001 rows (one more than the default 100_000-row slice size) and checks values at both the first row and the chunk boundary (row 100_000). Covers _export_vt_time's direct [start+i] indexing and _export_get_offset_ptr for the precomputed int64 paths. - test_polars_string_multichunk: same chunk-boundary check for the Polars Arrow buffer path in _export_extract_string_obj_arrow, which does raw C pointer arithmetic into the values buffer. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 73 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 76a3169..b29f5da 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -601,6 +601,79 @@ def test_numpy_timedelta_with_nulls(self): self.assertEqual(df2.at[3, "x"], datetime.timedelta(seconds=-30)) self.assertTrue(pd.isnull(df2.at[4, "x"])) + def test_empty_dataframe(self): + """Verify 0-row DataFrames export and import correctly for all column types. + + Exercises the zero-size array code paths that boundscheck=False leaves unchecked, + ensuring no off-by-one occurs at the loop boundary when row_count is 0. 
+ """ + cases = [ + ("bool", pd.DataFrame({"x": pd.array([], dtype="bool")})), + ("int64", pd.DataFrame({"x": pd.array([], dtype="int64")})), + ("float64", pd.DataFrame({"x": pd.array([], dtype="float64")})), + ("datetime64[ms]", pd.DataFrame({"x": pd.array([], dtype="datetime64[ms]")})), + ("timedelta64[ms]", pd.DataFrame({"x": pd.array([], dtype="timedelta64[ms]")})), + ] + for label, df in cases: + with self.subTest(dtype=label): + df2 = self._roundtrip_dataframe(df) + self.assertEqual(len(df2), 0) + self.assertIn("x", df2.columns) + # String requires an explicit type annotation when the column is empty (no values to infer from) + df = pd.DataFrame({"x": pd.Series([], dtype=object)}) + spotfire.set_spotfire_types(df, {"x": "String"}) + with self.subTest(dtype="string"): + df2 = self._roundtrip_dataframe(df) + self.assertEqual(len(df2), 0) + self.assertIn("x", df2.columns) + + def test_multichunk_export(self): + """Verify exports spanning multiple SBDF row slices produce correct values. + + The default slice size is ``100_000 // num_columns`` rows, so a 100_001-row + single-column DataFrame forces a second slice (start=100_000, count=1). + This exercises _export_vt_time's direct ``[start+i]`` indexing and the + _export_get_offset_ptr pointer arithmetic for precomputed int64 paths, + both of which are unchecked under boundscheck=False. 
+ """ + n = 100_001 + + # time: _export_vt_time accesses context.values_array[start + i] directly + times = [datetime.time(0, 0, 0)] * n + times[-1] = datetime.time(23, 59, 58) + df = pd.DataFrame({"t": times}) + df2 = self._roundtrip_dataframe(df) + self.assertEqual(len(df2), n) + self.assertEqual(df2.at[0, "t"], datetime.time(0, 0, 0)) + self.assertEqual(df2.at[n - 1, "t"], datetime.time(23, 59, 58)) + + # date: precomputed int64 via np.asarray, exported via _export_get_offset_ptr + dates = [datetime.date(2000, 1, 1)] * n + dates[-1] = datetime.date(2001, 9, 11) + df = pd.DataFrame({"d": dates}) + df2 = self._roundtrip_dataframe(df) + self.assertEqual(len(df2), n) + self.assertEqual(df2.at[0, "d"], datetime.date(2000, 1, 1)) + self.assertEqual(df2.at[n - 1, "d"], datetime.date(2001, 9, 11)) + + # datetime64[ms]: precomputed int64, exported via _export_get_offset_ptr + dts = pd.array([pd.Timestamp("2000-01-01")] * n, dtype="datetime64[ms]") + dts[-1] = pd.Timestamp("1969-07-20 20:17:40") + df = pd.DataFrame({"dt": dts}) + df2 = self._roundtrip_dataframe(df) + self.assertEqual(len(df2), n) + self.assertEqual(df2.at[0, "dt"], datetime.datetime(2000, 1, 1)) + self.assertEqual(df2.at[n - 1, "dt"], datetime.datetime(1969, 7, 20, 20, 17, 40)) + + # timedelta64[ms]: precomputed int64, exported via _export_get_offset_ptr + tds = pd.array([pd.Timedelta(0)] * n, dtype="timedelta64[ms]") + tds[-1] = pd.Timedelta(seconds=-1) + df = pd.DataFrame({"td": tds}) + df2 = self._roundtrip_dataframe(df) + self.assertEqual(len(df2), n) + self.assertEqual(df2.at[0, "td"], datetime.timedelta(0)) + self.assertEqual(df2.at[n - 1, "td"], datetime.timedelta(seconds=-1)) + def test_image_matplot(self): """Verify Matplotlib figures export properly.""" matplotlib.pyplot.clf() From b188627b1dd75111b1d101714126514ef0f06880 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 10:53:20 -0500 Subject: [PATCH 09/21] Revert "CI: add no_polars test environment to verify package works without 
polars/pyarrow" This reverts commit 681a67d39a1712cf022cf65b8247739378589c7e. --- test_requirements_no_polars.txt | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 test_requirements_no_polars.txt diff --git a/test_requirements_no_polars.txt b/test_requirements_no_polars.txt deleted file mode 100644 index 73ab30d..0000000 --- a/test_requirements_no_polars.txt +++ /dev/null @@ -1,6 +0,0 @@ -html-testRunner -geopandas -matplotlib -pillow -seaborn -shapely \ No newline at end of file From 128de8651ff8cf215a14e889466cc0b74d141809 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 10:57:27 -0500 Subject: [PATCH 10/21] CI: add AddressSanitizer job to catch out-of-bounds access in native extension Compiles sbdf.pyx with -fsanitize=address -fno-omit-frame-pointer and runs the full test suite under LD_PRELOAD=libasan.so with PYTHONMALLOC=malloc. This provides runtime detection of heap buffer overflows that boundscheck=False and the raw C pointer arithmetic in sbdf_helpers.c leave unchecked at the Python level. detect_leaks=0 suppresses intentional Python allocator "leaks". Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/build.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 04da882..815677c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -163,3 +163,35 @@ jobs: mypy spotfire cython-lint spotfire vendor find spotfire -name '*_helpers.[ch]' | xargs cpplint --repository=. 
+ asan: + name: AddressSanitizer + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install dependencies + run: | + pip install setuptools Cython "numpy>=2.0.0rc1" + pip install ".[polars]" + pip install -r test_requirements_default.txt + - name: Rebuild extension with AddressSanitizer + env: + CFLAGS: "-fsanitize=address -fno-omit-frame-pointer -g" + LDFLAGS: "-fsanitize=address" + run: python setup.py build_ext --inplace + - name: Run tests under AddressSanitizer + run: | + LIBASAN=$(gcc -print-file-name=libasan.so) + LD_PRELOAD="$LIBASAN" PYTHONMALLOC=malloc python -m spotfire.test + env: + ASAN_OPTIONS: "detect_leaks=0:allocator_may_return_null=1" + TEST_ENVIRONMENT: asan + - uses: actions/upload-artifact@v4 + if: always() + with: + name: test-results-asan + path: build/test-results/*.html From 0319d658d534fa01a99cf204f31d132a8bed163b Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 11:01:28 -0500 Subject: [PATCH 11/21] Fix: rename df/df2 variables to satisfy pylint invalid-name rule (min 3 chars) Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 128 ++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index b29f5da..8bf77c4 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -490,11 +490,11 @@ def test_temporal_nulls_roundtrip(self): } for col_name, values in cases.items(): with self.subTest(type=col_name): - df = pd.DataFrame({"x": values}) - df2 = self._roundtrip_dataframe(df) - self.assertFalse(pd.isnull(df2.at[0, "x"]), "row 0 should not be null") - self.assertTrue(pd.isnull(df2.at[1, "x"]), "row 1 should be null") - self.assertFalse(pd.isnull(df2.at[2, "x"]), "row 2 should not be null") + dataframe = pd.DataFrame({"x": values}) + new_df = self._roundtrip_dataframe(dataframe) + 
self.assertFalse(pd.isnull(new_df.at[0, "x"]), "row 0 should not be null") + self.assertTrue(pd.isnull(new_df.at[1, "x"]), "row 1 should be null") + self.assertFalse(pd.isnull(new_df.at[2, "x"]), "row 2 should not be null") def test_negative_timespans(self): """Verify that negative timedelta values round-trip correctly.""" @@ -505,11 +505,11 @@ def test_negative_timespans(self): datetime.timedelta(milliseconds=-1), datetime.timedelta(days=-1, seconds=86399, microseconds=999000), # -1 ms ] - df = pd.DataFrame({"x": cases}) - df2 = self._roundtrip_dataframe(df) + dataframe = pd.DataFrame({"x": cases}) + new_df = self._roundtrip_dataframe(dataframe) for i, expected in enumerate(cases): with self.subTest(i=i, value=expected): - got = df2.at[i, "x"] + got = new_df.at[i, "x"] # SBDF has millisecond resolution; truncate expected to ms expected_ms = datetime.timedelta(milliseconds=expected // datetime.timedelta(milliseconds=1)) self.assertEqual(got, expected_ms) @@ -524,11 +524,11 @@ def test_pre_epoch_dates(self): datetime.date(1970, 1, 2), # one day after Unix epoch datetime.date(9999, 12, 31), # max Python date ] - df = pd.DataFrame({"x": cases}) - df2 = self._roundtrip_dataframe(df) + dataframe = pd.DataFrame({"x": cases}) + new_df = self._roundtrip_dataframe(dataframe) for i, expected in enumerate(cases): with self.subTest(date=expected): - self.assertEqual(df2.at[i, "x"], expected) + self.assertEqual(new_df.at[i, "x"], expected) def test_pre_epoch_datetimes(self): """Verify that datetimes before the Unix epoch round-trip correctly.""" @@ -537,11 +537,11 @@ def test_pre_epoch_datetimes(self): datetime.datetime(1969, 12, 31, 23, 59, 59), datetime.datetime(1969, 12, 31, 0, 0, 0), ] - df = pd.DataFrame({"x": cases}) - df2 = self._roundtrip_dataframe(df) + dataframe = pd.DataFrame({"x": cases}) + new_df = self._roundtrip_dataframe(dataframe) for i, expected in enumerate(cases): with self.subTest(dt=expected): - self.assertEqual(df2.at[i, "x"], expected) + 
self.assertEqual(new_df.at[i, "x"], expected) def test_time_edge_cases(self): """Verify midnight, end-of-day, and microsecond-precision time values.""" @@ -553,19 +553,19 @@ def test_time_edge_cases(self): ] for input_val, expected in cases: with self.subTest(time=input_val): - df = pd.DataFrame({"x": [input_val]}) - df2 = self._roundtrip_dataframe(df) - self.assertEqual(df2.at[0, "x"], expected) + dataframe = pd.DataFrame({"x": [input_val]}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(new_df.at[0, "x"], expected) def test_all_null_temporal_columns(self): """Verify that all-null columns of each temporal type export and import without error.""" for spotfire_type, dtype in [("DateTime", "datetime64[ms]"), ("TimeSpan", "timedelta64[ms]")]: with self.subTest(type=spotfire_type): - df = pd.DataFrame({"x": pd.array([pd.NaT, pd.NaT, pd.NaT], dtype=dtype)}) - df2 = self._roundtrip_dataframe(df) - self.assertEqual(len(df2), 3) - self.assertTrue(df2["x"].isna().all()) + dataframe = pd.DataFrame({"x": pd.array([pd.NaT, pd.NaT, pd.NaT], dtype=dtype)}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), 3) + self.assertTrue(new_df["x"].isna().all()) def test_numpy_datetime_with_nulls(self): """Verify that numpy datetime64 columns with NaT values export and import correctly.""" @@ -576,13 +576,13 @@ def test_numpy_datetime_with_nulls(self): pd.Timestamp("1969-07-20"), pd.NaT, ], dtype="datetime64[ms]") - df = pd.DataFrame({"x": values}) - df2 = self._roundtrip_dataframe(df) - self.assertTrue(pd.isnull(df2.at[0, "x"])) - self.assertEqual(df2.at[1, "x"], datetime.datetime(2020, 1, 1)) - self.assertTrue(pd.isnull(df2.at[2, "x"])) - self.assertEqual(df2.at[3, "x"], datetime.datetime(1969, 7, 20)) - self.assertTrue(pd.isnull(df2.at[4, "x"])) + dataframe = pd.DataFrame({"x": values}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertTrue(pd.isnull(new_df.at[0, "x"])) + self.assertEqual(new_df.at[1, "x"], 
datetime.datetime(2020, 1, 1)) + self.assertTrue(pd.isnull(new_df.at[2, "x"])) + self.assertEqual(new_df.at[3, "x"], datetime.datetime(1969, 7, 20)) + self.assertTrue(pd.isnull(new_df.at[4, "x"])) def test_numpy_timedelta_with_nulls(self): """Verify that numpy timedelta64 columns with NaT values export and import correctly.""" @@ -593,13 +593,13 @@ def test_numpy_timedelta_with_nulls(self): pd.Timedelta(seconds=-30), pd.NaT, ], dtype="timedelta64[ms]") - df = pd.DataFrame({"x": values}) - df2 = self._roundtrip_dataframe(df) - self.assertTrue(pd.isnull(df2.at[0, "x"])) - self.assertEqual(df2.at[1, "x"], datetime.timedelta(days=1)) - self.assertTrue(pd.isnull(df2.at[2, "x"])) - self.assertEqual(df2.at[3, "x"], datetime.timedelta(seconds=-30)) - self.assertTrue(pd.isnull(df2.at[4, "x"])) + dataframe = pd.DataFrame({"x": values}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertTrue(pd.isnull(new_df.at[0, "x"])) + self.assertEqual(new_df.at[1, "x"], datetime.timedelta(days=1)) + self.assertTrue(pd.isnull(new_df.at[2, "x"])) + self.assertEqual(new_df.at[3, "x"], datetime.timedelta(seconds=-30)) + self.assertTrue(pd.isnull(new_df.at[4, "x"])) def test_empty_dataframe(self): """Verify 0-row DataFrames export and import correctly for all column types. 
@@ -614,18 +614,18 @@ def test_empty_dataframe(self): ("datetime64[ms]", pd.DataFrame({"x": pd.array([], dtype="datetime64[ms]")})), ("timedelta64[ms]", pd.DataFrame({"x": pd.array([], dtype="timedelta64[ms]")})), ] - for label, df in cases: + for label, dataframe in cases: with self.subTest(dtype=label): - df2 = self._roundtrip_dataframe(df) - self.assertEqual(len(df2), 0) - self.assertIn("x", df2.columns) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), 0) + self.assertIn("x", new_df.columns) # String requires an explicit type annotation when the column is empty (no values to infer from) - df = pd.DataFrame({"x": pd.Series([], dtype=object)}) - spotfire.set_spotfire_types(df, {"x": "String"}) + str_df = pd.DataFrame({"x": pd.Series([], dtype=object)}) + spotfire.set_spotfire_types(str_df, {"x": "String"}) with self.subTest(dtype="string"): - df2 = self._roundtrip_dataframe(df) - self.assertEqual(len(df2), 0) - self.assertIn("x", df2.columns) + new_df = self._roundtrip_dataframe(str_df) + self.assertEqual(len(new_df), 0) + self.assertIn("x", new_df.columns) def test_multichunk_export(self): """Verify exports spanning multiple SBDF row slices produce correct values. 
@@ -641,38 +641,38 @@ def test_multichunk_export(self): # time: _export_vt_time accesses context.values_array[start + i] directly times = [datetime.time(0, 0, 0)] * n times[-1] = datetime.time(23, 59, 58) - df = pd.DataFrame({"t": times}) - df2 = self._roundtrip_dataframe(df) - self.assertEqual(len(df2), n) - self.assertEqual(df2.at[0, "t"], datetime.time(0, 0, 0)) - self.assertEqual(df2.at[n - 1, "t"], datetime.time(23, 59, 58)) + dataframe = pd.DataFrame({"t": times}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), n) + self.assertEqual(new_df.at[0, "t"], datetime.time(0, 0, 0)) + self.assertEqual(new_df.at[n - 1, "t"], datetime.time(23, 59, 58)) # date: precomputed int64 via np.asarray, exported via _export_get_offset_ptr dates = [datetime.date(2000, 1, 1)] * n dates[-1] = datetime.date(2001, 9, 11) - df = pd.DataFrame({"d": dates}) - df2 = self._roundtrip_dataframe(df) - self.assertEqual(len(df2), n) - self.assertEqual(df2.at[0, "d"], datetime.date(2000, 1, 1)) - self.assertEqual(df2.at[n - 1, "d"], datetime.date(2001, 9, 11)) + dataframe = pd.DataFrame({"d": dates}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), n) + self.assertEqual(new_df.at[0, "d"], datetime.date(2000, 1, 1)) + self.assertEqual(new_df.at[n - 1, "d"], datetime.date(2001, 9, 11)) # datetime64[ms]: precomputed int64, exported via _export_get_offset_ptr dts = pd.array([pd.Timestamp("2000-01-01")] * n, dtype="datetime64[ms]") dts[-1] = pd.Timestamp("1969-07-20 20:17:40") - df = pd.DataFrame({"dt": dts}) - df2 = self._roundtrip_dataframe(df) - self.assertEqual(len(df2), n) - self.assertEqual(df2.at[0, "dt"], datetime.datetime(2000, 1, 1)) - self.assertEqual(df2.at[n - 1, "dt"], datetime.datetime(1969, 7, 20, 20, 17, 40)) + dataframe = pd.DataFrame({"dt": dts}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), n) + self.assertEqual(new_df.at[0, "dt"], datetime.datetime(2000, 1, 1)) + 
self.assertEqual(new_df.at[n - 1, "dt"], datetime.datetime(1969, 7, 20, 20, 17, 40)) # timedelta64[ms]: precomputed int64, exported via _export_get_offset_ptr tds = pd.array([pd.Timedelta(0)] * n, dtype="timedelta64[ms]") tds[-1] = pd.Timedelta(seconds=-1) - df = pd.DataFrame({"td": tds}) - df2 = self._roundtrip_dataframe(df) - self.assertEqual(len(df2), n) - self.assertEqual(df2.at[0, "td"], datetime.timedelta(0)) - self.assertEqual(df2.at[n - 1, "td"], datetime.timedelta(seconds=-1)) + dataframe = pd.DataFrame({"td": tds}) + new_df = self._roundtrip_dataframe(dataframe) + self.assertEqual(len(new_df), n) + self.assertEqual(new_df.at[0, "td"], datetime.timedelta(0)) + self.assertEqual(new_df.at[n - 1, "td"], datetime.timedelta(seconds=-1)) def test_image_matplot(self): """Verify Matplotlib figures export properly.""" From ca99bf7d57fabe011115cafda394fc098111a62c Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 11:11:18 -0500 Subject: [PATCH 12/21] Fix: add intercept_cxx_exceptions=0 to ASAN_OPTIONS to suppress matplotlib false positive When using LD_PRELOAD ASan injection with a non-ASan-compiled Python, ASan's __cxa_throw interceptor is never initialized. matplotlib's ft2font.so throws a C++ exception during import, hitting the uninitialized interceptor and causing a CHECK failure. intercept_cxx_exceptions=0 disables the interceptor entirely; sbdf.pyx generates no C++ exceptions so there is no loss of coverage. 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 815677c..4b4b533 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -188,7 +188,7 @@ jobs: LIBASAN=$(gcc -print-file-name=libasan.so) LD_PRELOAD="$LIBASAN" PYTHONMALLOC=malloc python -m spotfire.test env: - ASAN_OPTIONS: "detect_leaks=0:allocator_may_return_null=1" + ASAN_OPTIONS: "detect_leaks=0:allocator_may_return_null=1:intercept_cxx_exceptions=0" TEST_ENVIRONMENT: asan - uses: actions/upload-artifact@v4 if: always() From eff19418ba4492c0dbb7f2740f96af57f4c8387f Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 11:20:19 -0500 Subject: [PATCH 13/21] Fix: type: ignore for pd.array NaT overloads; pin ASan job to Python 3.13 mypy: pd.array() with list[NaTType] or list[NaT|Timedelta] and a string dtype has no matching overload in pandas-stubs. Add type: ignore[call-overload] on the two affected lines in test_all_null_temporal_columns and test_numpy_timedelta_with_nulls. ASan: Python 3.14 (beta) triggers a CHECK failure in asan_interceptors.cpp when ft2font.so throws a C++ exception, even with intercept_cxx_exceptions=0. Pin the ASan job to Python 3.13 where LD_PRELOAD ASan injection works cleanly. 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/build.yaml | 2 +- spotfire/test/test_sbdf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 4b4b533..c115cb4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -172,7 +172,7 @@ jobs: submodules: recursive - uses: actions/setup-python@v5 with: - python-version: '3.x' + python-version: '3.13' - name: Install dependencies run: | pip install setuptools Cython "numpy>=2.0.0rc1" diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 8bf77c4..4228537 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -562,7 +562,7 @@ def test_all_null_temporal_columns(self): for spotfire_type, dtype in [("DateTime", "datetime64[ms]"), ("TimeSpan", "timedelta64[ms]")]: with self.subTest(type=spotfire_type): - dataframe = pd.DataFrame({"x": pd.array([pd.NaT, pd.NaT, pd.NaT], dtype=dtype)}) + dataframe = pd.DataFrame({"x": pd.array([pd.NaT, pd.NaT, pd.NaT], dtype=dtype)}) # type: ignore[call-overload] new_df = self._roundtrip_dataframe(dataframe) self.assertEqual(len(new_df), 3) self.assertTrue(new_df["x"].isna().all()) @@ -586,7 +586,7 @@ def test_numpy_datetime_with_nulls(self): def test_numpy_timedelta_with_nulls(self): """Verify that numpy timedelta64 columns with NaT values export and import correctly.""" - values = pd.array([ + values = pd.array([ # type: ignore[call-overload] pd.NaT, pd.Timedelta(days=1), pd.NaT, From acb1054ad01cdeb3fdabef1e729c4383db17471e Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 11:32:50 -0500 Subject: [PATCH 14/21] CI: fix ASan crash by dropping pybind11 packages; bump actions to Node.js 24; fix line-too-long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ASan job: replace test_requirements_default.txt with html-testRunner + polars + pillow. 
matplotlib/seaborn/geopandas/shapely use pybind11 C++ extensions that throw exceptions, crashing LD_PRELOAD libasan injection (intercept_cxx_exceptions=0 doesn't help here). pillow is plain C — safe to keep for PIL image export ASan coverage. - Bump GitHub Actions to Node.js 24: checkout v4→v5, setup-python v5→v6, upload-artifact v4→v7, download-artifact v4→v8. - Fix pylint line-too-long (127>120) in test_sbdf.py line 565. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/build.yaml | 38 ++++++++++++++++++----------------- .github/workflows/pylint.yaml | 4 ++-- .github/workflows/sbom.yaml | 22 ++++++++++---------- spotfire/test/test_sbdf.py | 3 ++- 4 files changed, 35 insertions(+), 32 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c115cb4..984335e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,15 +1,17 @@ name: Build and Test Package on: [push, pull_request] +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true jobs: build-sdist: name: Build Source Dist runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: recursive - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' - name: Install Tools @@ -18,11 +20,11 @@ jobs: - name: Source Packaging run: | python -m build --sdist - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 with: name: sdist path: 'dist/spotfire-*.tar.gz' - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 with: name: test-files path: | @@ -50,12 +52,12 @@ jobs: operating-system: ['ubuntu-latest', 'windows-latest'] fail-fast: false steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: sdist path: dist - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install Build Requirements 
@@ -81,7 +83,7 @@ jobs: python -m build --wheel # Move wheel out of build dir into top-level dist dir mv dist\*.whl ..\dist - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 with: name: wheel-${{ matrix.python-version }}-${{ matrix.operating-system }} path: 'dist/spotfire-*.whl' @@ -96,16 +98,16 @@ jobs: test-environment: ${{ fromJson(needs.build-sdist.outputs.test-environments) }} fail-fast: false steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: wheel-${{ matrix.python-version }}-${{ matrix.operating-system }} path: dist - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: test-files path: test-files - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install Dependencies (Linux) @@ -122,7 +124,7 @@ jobs: env: TEST_FILES_DIR: ${{ github.workspace }}/test-files/spotfire/test/files TEST_ENVIRONMENT: ${{ matrix.test-environment }} - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: ${{ always() }} with: name: test-results-${{ matrix.python-version }}-${{ matrix.operating-system }}-${{ matrix.test-environment }} @@ -138,14 +140,14 @@ jobs: echo -n "python-version=" >> $GITHUB_OUTPUT echo '${{ needs.build-sdist.outputs.python-versions }}' | sed -e 's/[^"]*"//' -e 's/".*//' >> $GITHUB_OUTPUT - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ steps.version.outputs.python-version }} - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: sdist path: dist - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: name: wheel-${{ steps.version.outputs.python-version }}-ubuntu-latest path: dist @@ -167,17 +169,17 @@ jobs: name: AddressSanitizer runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 
with: submodules: recursive - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: '3.13' - name: Install dependencies run: | pip install setuptools Cython "numpy>=2.0.0rc1" pip install ".[polars]" - pip install -r test_requirements_default.txt + pip install html-testRunner polars pillow - name: Rebuild extension with AddressSanitizer env: CFLAGS: "-fsanitize=address -fno-omit-frame-pointer -g" @@ -190,7 +192,7 @@ jobs: env: ASAN_OPTIONS: "detect_leaks=0:allocator_may_return_null=1:intercept_cxx_exceptions=0" TEST_ENVIRONMENT: asan - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: always() with: name: test-results-asan diff --git a/.github/workflows/pylint.yaml b/.github/workflows/pylint.yaml index 58911b8..3f03d70 100644 --- a/.github/workflows/pylint.yaml +++ b/.github/workflows/pylint.yaml @@ -7,11 +7,11 @@ jobs: name: Check Linters runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: recursive - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' - name: Install Tools diff --git a/.github/workflows/sbom.yaml b/.github/workflows/sbom.yaml index 72094c3..45cd37e 100644 --- a/.github/workflows/sbom.yaml +++ b/.github/workflows/sbom.yaml @@ -35,7 +35,7 @@ jobs: outputs: python-versions: ${{ steps.dynamic.outputs.pythons }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Read python-versions id: dynamic run: | @@ -48,14 +48,14 @@ jobs: needs: setup runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: recursive # needed for vendor/sbdf-c when building/installing sdist # workflow_run: reuse artifact from build.yaml — no rebuild - name: Download sdist (from workflow_run) if: github.event_name == 'workflow_run' - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: sdist path: dist @@ 
-64,7 +64,7 @@ jobs: # push / release / workflow_dispatch: build fresh - name: Set Up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' - name: Build sdist @@ -118,7 +118,7 @@ jobs: --tool "trivy-${{ env.TRIVY_VERSION }}" - name: Upload SBOM artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: sbom-sdist path: spotfire-sdist.sbom.spdx.json @@ -133,14 +133,14 @@ jobs: python-version: ${{ fromJson(needs.setup.outputs.python-versions) }} fail-fast: false steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: recursive # needed for vendor/sbdf-c when building wheel fresh # workflow_run: reuse the ubuntu wheel artifact from build.yaml — no rebuild - name: Download wheel (from workflow_run) if: github.event_name == 'workflow_run' - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: wheel-${{ matrix.python-version }}-ubuntu-latest path: dist @@ -150,7 +150,7 @@ jobs: # Also download the sdist so scan-env can install from it (wheel is platform-specific) - name: Download sdist (from workflow_run) if: github.event_name == 'workflow_run' - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: sdist path: dist @@ -160,7 +160,7 @@ jobs: # push / release / workflow_dispatch: build fresh on Linux - name: Set Up Python if: github.event_name != 'workflow_run' - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Build wheel @@ -221,7 +221,7 @@ jobs: --tool "trivy-${{ env.TRIVY_VERSION }}" - name: Upload SBOM artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: sbom-wheel-${{ matrix.python-version }} path: spotfire-wheel-${{ matrix.python-version }}.sbom.spdx.json @@ -234,7 +234,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download all SBOM artifacts - uses: actions/download-artifact@v4 + uses: 
actions/download-artifact@v8 with: pattern: sbom-* path: all-sboms diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 4228537..f8e0a91 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -562,7 +562,8 @@ def test_all_null_temporal_columns(self): for spotfire_type, dtype in [("DateTime", "datetime64[ms]"), ("TimeSpan", "timedelta64[ms]")]: with self.subTest(type=spotfire_type): - dataframe = pd.DataFrame({"x": pd.array([pd.NaT, pd.NaT, pd.NaT], dtype=dtype)}) # type: ignore[call-overload] + dataframe = pd.DataFrame({"x": pd.array([pd.NaT, pd.NaT, pd.NaT], # type: ignore[call-overload] + dtype=dtype)}) new_df = self._roundtrip_dataframe(dataframe) self.assertEqual(len(new_df), 3) self.assertTrue(new_df["x"].isna().all()) From 544205bf9aac752b2c95ccaa1e0bac29a46f91ea Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 11:37:49 -0500 Subject: [PATCH 15/21] Fix: make geopandas/matplotlib/seaborn imports optional; add CI concurrency group test_sbdf.py imported geopandas, matplotlib, and seaborn unconditionally, causing ModuleNotFoundError in the ASan CI job where those packages are not installed. Change to try/except with None fallback (matching the polars pattern) and add @unittest.skipIf guards to test_read_write_geodata, test_image_matplot, test_image_seaborn. Also add concurrency group to build.yaml to cancel superseded runs on push. 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/build.yaml | 3 +++ spotfire/test/test_sbdf.py | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 984335e..2bd24ef 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,5 +1,8 @@ name: Build and Test Package on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true jobs: diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index f8e0a91..42601c4 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -11,9 +11,18 @@ import pandas as pd import pandas.testing as pdtest import numpy as np -import geopandas as gpd -import matplotlib.pyplot -import seaborn +try: + import geopandas as gpd # type: ignore[import-not-found] +except ImportError: + gpd = None # type: ignore[assignment] +try: + import matplotlib.pyplot # type: ignore[import-not-found] +except ImportError: + matplotlib = None # type: ignore[assignment] +try: + import seaborn # type: ignore[import-not-found] +except ImportError: + seaborn = None # type: ignore[assignment] import PIL.Image from packaging import version @@ -144,6 +153,7 @@ def test_read_10001(self): self.assertEqual(dataframe.at[10000, "String"], "kiwis") self.assertEqual(dataframe.at[10000, "Binary"], b"\x7c\x7d\x7e\x7f") + @unittest.skipIf(gpd is None, "geopandas not installed") def test_read_write_geodata(self): """Test that geo-encoded data is properly converted to/from ``GeoDataFrame``.""" gdf = sbdf.import_data(utils.get_test_data_file("sbdf/NACountries.sbdf")) @@ -675,6 +685,7 @@ def test_multichunk_export(self): self.assertEqual(new_df.at[0, "td"], datetime.timedelta(0)) self.assertEqual(new_df.at[n - 1, "td"], datetime.timedelta(seconds=-1)) + @unittest.skipIf(matplotlib is None, "matplotlib not installed") def 
test_image_matplot(self): """Verify Matplotlib figures export properly.""" matplotlib.pyplot.clf() @@ -687,6 +698,7 @@ def test_image_matplot(self): else: self.fail(f"Expected PNG bytes, got {type(image)}: {image!r}") + @unittest.skipIf(seaborn is None, "seaborn not installed") def test_image_seaborn(self): """Verify Seaborn grids export properly.""" matplotlib.pyplot.clf() From 53b608261b531f4d31d3c36fc593d0b1544ea21c Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 11:43:40 -0500 Subject: [PATCH 16/21] Fix: add explicit 'import matplotlib' so pylint recognises it as a module alias Without the explicit import, pylint sees 'matplotlib = None' in the except block as a new constant assignment and flags it as invalid-name (expects UPPER_CASE). Adding 'import matplotlib' before 'import matplotlib.pyplot' matches the same try/except pattern used for polars (import + None fallback). Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 42601c4..9f096ba 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -16,7 +16,8 @@ except ImportError: gpd = None # type: ignore[assignment] try: - import matplotlib.pyplot # type: ignore[import-not-found] + import matplotlib # type: ignore[import-not-found] + import matplotlib.pyplot except ImportError: matplotlib = None # type: ignore[assignment] try: From d83e78fa5959cb4911797f2a920028563190e8c3 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 18:55:30 -0500 Subject: [PATCH 17/21] Perf: direct pointer access in string/binary C helpers; eliminate slice alloc in offset ptr Three C-level optimizations: 1. _export_extract_string_obj / _export_extract_binary_obj: replace per-element PySequence_GetItem calls (Python API dispatch + refcount overhead) with direct pointer arithmetic into numpy array buffers. 
Callers now pass PyArray_DATA(values_array) as void** and PyArray_DATA(invalid_array) as unsigned char*, eliminating ~2N Python API round-trips per string/binary column. 2. _export_get_offset_ptr: replace the Python slice allocation (array[start:start+count]) with direct byte-offset arithmetic on PyArray_DATA. Avoids a numpy view object allocation on every chunk/column export call. 3. Import string columns: pre-mask the numpy object array before pd.Series() construction instead of assigning None via .loc[] after the fact. The .loc path triggers pandas label-indexing overhead; direct numpy assignment is O(k) with no indexer allocation. Applied only when values.dtype.kind == 'O' to avoid incorrect coercion on bool/float arrays. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 48 ++++++++++++++++++------ spotfire/sbdf_helpers.c | 79 +++++++++++++-------------------------- spotfire/sbdf_helpers.h | 10 +++-- spotfire/sbdf_helpers.pxi | 11 +++--- 4 files changed, 76 insertions(+), 72 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 0b25f37..c2a1101 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1063,9 +1063,16 @@ def import_data(sbdf_file, output_format=OutputFormat.PANDAS): pd.arrays.IntegerArray(values.astype(base_dtype), invalid_array), name=column_names[i]) else: - column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) - if invalid_array.any(): - column_series.loc[invalid_array] = None + if values.dtype.kind == 'O': + # Object-dtype (string) arrays can be pre-masked before Series construction, + # avoiding the pandas .loc indexing overhead on the post-construction path. 
+ if invalid_array.any(): + values[invalid_array] = None + column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) + else: + column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) + if invalid_array.any(): + column_series.loc[invalid_array] = None imported_columns.append(column_series) dataframe = pd.DataFrame(dict(zip(column_names, imported_columns))) for i in range(num_columns): @@ -2118,13 +2125,19 @@ cdef int _export_vt_timespan(_ExportContext context, Py_ssize_t start, Py_ssize_ cdef int _export_vt_string(_ExportContext context, Py_ssize_t start, Py_ssize_t count, sbdf_c.sbdf_object** obj): """Export a slice of data consisting of string values.""" - obj[0] = _export_extract_string_obj(context.values_array, context.invalid_array, start, count) + obj[0] = _export_extract_string_obj( + np_c.PyArray_DATA(context.values_array), + np_c.PyArray_DATA(context.invalid_array), + start, count) return sbdf_c.SBDF_OK cdef int _export_vt_binary(_ExportContext context, Py_ssize_t start, Py_ssize_t count, sbdf_c.sbdf_object** obj): """Export a slice of data consisting of binary values.""" - obj[0] = _export_extract_binary_obj(context.values_array, context.invalid_array, start, count) + obj[0] = _export_extract_binary_obj( + np_c.PyArray_DATA(context.values_array), + np_c.PyArray_DATA(context.invalid_array), + start, count) return sbdf_c.SBDF_OK @@ -2363,15 +2376,16 @@ cdef (int, sbdf_c.sbdf_valuearray*) _export_process_invalid_array(_ExportContext cdef inline void* _export_get_offset_ptr(np_c.ndarray array, Py_ssize_t start, Py_ssize_t count): - """Slice a NumPy ``ndarray`` using Cython memoryviews. + """Return a pointer into ``array`` at element ``start``. 
:param array: the NumPy array to slice :param start: the index of the first element of the slice - :param count: the number of elements to include in the slice - :return: a pointer to the memory (owned by the NumPy array) of the slice + :param count: unused; kept for call-site compatibility + :return: a pointer to element ``start`` in the array's data buffer """ - cdef np_c.ndarray sliced = array[start : start + count] - return np_c.PyArray_DATA(sliced) + cdef char *base = np_c.PyArray_DATA(array) + cdef Py_ssize_t sz = array.itemsize + return (base + start * sz) cdef sbdf_c.sbdf_metadata_head* _export_metadata(dict md, int column_num): @@ -2414,7 +2428,12 @@ cdef sbdf_c.sbdf_metadata_head* _export_metadata(dict md, int column_num): val_type.id = _export_infer_valuetype_from_type(val, f"{metadata_description} metadata '{name_str}'") if val_type.id == sbdf_c.SBDF_STRINGTYPEID: - obj = _export_extract_string_obj(val, [False] * val_len, 0, val_len) + _meta_vals = np.asarray(val, dtype=object) + _meta_inv = np.zeros(val_len, dtype=bool) + obj = _export_extract_string_obj( + np_c.PyArray_DATA(_meta_vals), + np_c.PyArray_DATA(_meta_inv), + 0, val_len) error = sbdf_c.SBDF_OK elif val_type.id == sbdf_c.SBDF_DOUBLETYPEID: data_double = mem.PyMem_RawMalloc(val_len * sizeof(double)) @@ -2474,7 +2493,12 @@ cdef sbdf_c.sbdf_metadata_head* _export_metadata(dict md, int column_num): error = sbdf_c.sbdf_obj_create_arr(val_type, val_len, data_datetime, NULL, &obj) mem.PyMem_RawFree(data_datetime) elif val_type.id == sbdf_c.SBDF_BINARYTYPEID: - obj = _export_extract_binary_obj(val, [False] * val_len, 0, val_len) + _meta_vals = np.asarray(val, dtype=object) + _meta_inv = np.zeros(val_len, dtype=bool) + obj = _export_extract_binary_obj( + np_c.PyArray_DATA(_meta_vals), + np_c.PyArray_DATA(_meta_inv), + 0, val_len) error = sbdf_c.SBDF_OK elif val_type.id == sbdf_c.SBDF_DECIMALTYPEID: data_decimal = <_SbdfDecimal*>mem.PyMem_RawMalloc(val_len * sizeof(_SbdfDecimal)) diff --git 
a/spotfire/sbdf_helpers.c b/spotfire/sbdf_helpers.c index ce89a23..5931cdb 100644 --- a/spotfire/sbdf_helpers.c +++ b/spotfire/sbdf_helpers.c @@ -80,15 +80,22 @@ void _allocated_list_done(struct _AllocatedList *alist, _allocated_dealloc_fn fu } } -/* Utility functions for extracting strings from Python ``Union[str,bytes]`` into C */ -sbdf_object *_export_extract_string_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count) { +/* Utility functions for extracting strings from Python ``Union[str,bytes]`` into C. + * vals_ptr is PyArray_DATA() of a numpy object array; each slot is a borrowed PyObject*. + * inv_ptr is PyArray_DATA() of a numpy bool array; nonzero byte means null/invalid. + */ +sbdf_object *_export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, Py_ssize_t start, Py_ssize_t count) { sbdf_object *t = calloc(1, sizeof(sbdf_object)); + if (!t) { + PyErr_NoMemory(); + return NULL; + } t->type = sbdf_vt_string(); t->count = (int)count; char **data = (char **)calloc(count, sizeof(char *)); if (!data) { - PyErr_Format(PyExc_MemoryError, "memory exhausted"); + PyErr_NoMemory(); sbdf_obj_destroy(t); return NULL; } @@ -96,53 +103,33 @@ sbdf_object *_export_extract_string_obj(PyObject *vals, PyObject *invalids, Py_s for (int i = 0; i < count; i++) { Py_ssize_t idx = start + i; - PyObject *inv = PySequence_GetItem(invalids, idx); - if (inv == NULL) { - sbdf_obj_destroy(t); - return NULL; - } - if (PyObject_IsTrue(inv)) { - /* true: invalid value, add empty value to t->data */ + if (inv_ptr[idx]) { + /* null/invalid value: write empty string */ data[i] = sbdf_str_create_len("", 0); } else { - /* false: valid value, add encoded value to t->data */ - PyObject *val = PySequence_GetItem(vals, idx); - if (val == NULL) { - Py_XDECREF(inv); - sbdf_obj_destroy(t); - return NULL; - } + /* valid value: borrowed ref from numpy object array — no Py_DECREF */ + PyObject *val = (PyObject *)vals_ptr[idx]; PyObject *val_str = PyObject_Str(val); 
if (val_str == NULL) { - Py_XDECREF(val); - Py_XDECREF(inv); sbdf_obj_destroy(t); return NULL; } PyObject *val_encoded = PyObject_CallMethod(val_str, "encode", "s", "utf-8"); + Py_DECREF(val_str); if (val_encoded == NULL) { - Py_XDECREF(val_str); - Py_XDECREF(val); - Py_XDECREF(inv); sbdf_obj_destroy(t); return NULL; } char *val_buf; Py_ssize_t val_len; if (PyBytes_AsStringAndSize(val_encoded, &val_buf, &val_len) == -1) { - Py_XDECREF(val_encoded); - Py_XDECREF(val_str); - Py_XDECREF(val); - Py_XDECREF(inv); + Py_DECREF(val_encoded); sbdf_obj_destroy(t); return NULL; } data[i] = sbdf_str_create_len(val_buf, (int)val_len); - Py_XDECREF(val_encoded); - Py_XDECREF(val_str); - Py_XDECREF(val); + Py_DECREF(val_encoded); } - Py_XDECREF(inv); } return t; @@ -178,14 +165,18 @@ sbdf_object *_export_extract_string_obj_arrow(const char *values_buf, const int6 return t; } -sbdf_object *_export_extract_binary_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count) { +sbdf_object *_export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, Py_ssize_t start, Py_ssize_t count) { sbdf_object *t = calloc(1, sizeof(sbdf_object)); + if (!t) { + PyErr_NoMemory(); + return NULL; + } t->type = sbdf_vt_binary(); t->count = (int)count; unsigned char **data = (unsigned char **)calloc(count, sizeof(unsigned char *)); if (!data) { - PyErr_Format(PyExc_MemoryError, "memory exhausted"); + PyErr_NoMemory(); sbdf_obj_destroy(t); return NULL; } @@ -193,41 +184,25 @@ sbdf_object *_export_extract_binary_obj(PyObject *vals, PyObject *invalids, Py_s for (int i = 0; i < count; i++) { Py_ssize_t idx = start + i; - PyObject *inv = PySequence_GetItem(invalids, idx); - if (inv == NULL) { - sbdf_obj_destroy(t); - return NULL; - } - if (PyObject_IsTrue(inv)) { - /* true: invalid value, add empty value to t->data */ + if (inv_ptr[idx]) { + /* null/invalid value: write empty byte array */ data[i] = sbdf_ba_create(0, 0); } else { - /* false: valid value, add value to t->data */ 
- PyObject *val = PySequence_GetItem(vals, idx); - if (val == NULL) { - Py_XDECREF(inv); - sbdf_obj_destroy(t); - return NULL; - } + /* valid value: borrowed ref from numpy object array — no Py_DECREF */ + PyObject *val = (PyObject *)vals_ptr[idx]; if (!PyBytes_Check(val)) { PyErr_Format(PyExc_SBDFError, "cannot convert '%S' to Spotfire Binary type; incompatible types", val); - Py_XDECREF(val); - Py_XDECREF(inv); sbdf_obj_destroy(t); return NULL; } char *val_buf; Py_ssize_t val_len; if (PyBytes_AsStringAndSize(val, &val_buf, &val_len) == -1) { - Py_XDECREF(val); - Py_XDECREF(inv); sbdf_obj_destroy(t); return NULL; } data[i] = sbdf_ba_create((unsigned char *)val_buf, (int)val_len); - Py_XDECREF(val); } - Py_XDECREF(inv); } return t; diff --git a/spotfire/sbdf_helpers.h b/spotfire/sbdf_helpers.h index 04e1255..95d90f6 100644 --- a/spotfire/sbdf_helpers.h +++ b/spotfire/sbdf_helpers.h @@ -35,9 +35,13 @@ struct _SbdfDecimal { unsigned char exponent_high_and_sign; }; -/* Utility functions for extracting strings from Python ``Union[str,bytes]`` into C */ -extern sbdf_object *_export_extract_string_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count); -extern sbdf_object *_export_extract_binary_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count); +/* Utility functions for extracting strings from Python ``Union[str,bytes]`` into C. + * vals_ptr: PyArray_DATA() of a numpy object array (array of PyObject* slots). + * inv_ptr: PyArray_DATA() of a numpy bool array (one byte per element, nonzero == null). + * Both pointers must remain valid for the duration of the call (caller holds the numpy arrays). 
+ */ +extern sbdf_object *_export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, Py_ssize_t start, Py_ssize_t count); +extern sbdf_object *_export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, Py_ssize_t start, Py_ssize_t count); /* Fast string export directly from Arrow LargeUtf8 buffers: no Python str objects created. * values_buf: concatenated UTF-8 bytes from the Arrow values buffer. diff --git a/spotfire/sbdf_helpers.pxi b/spotfire/sbdf_helpers.pxi index ea719fa..d0e594b 100644 --- a/spotfire/sbdf_helpers.pxi +++ b/spotfire/sbdf_helpers.pxi @@ -21,11 +21,12 @@ cdef extern from "sbdf_helpers.h": unsigned char exponent_low unsigned char exponent_high_and_sign - # Utility functions for extracting strings from Python ``Union[str,bytes]`` into C - sbdf_c.sbdf_object* _export_extract_string_obj(object val, object invalids, Py_ssize_t start, Py_ssize_t count) \ - except NULL - sbdf_c.sbdf_object* _export_extract_binary_obj(object val, object invalids, Py_ssize_t start, Py_ssize_t count) \ - except NULL + # Utility functions for extracting strings from Python ``Union[str,bytes]`` into C. + # vals_ptr: PyArray_DATA() of a numpy object array; inv_ptr: PyArray_DATA() of a numpy bool array. 
+ sbdf_c.sbdf_object* _export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count) except NULL + sbdf_c.sbdf_object* _export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count) except NULL # Fast Arrow LargeUtf8 path: no Python str objects, no re-encoding sbdf_c.sbdf_object* _export_extract_string_obj_arrow(const char *values_buf, const long long *offsets, From 7c1ed6744af4869f71a989bf789c0b1d73c79fd0 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 19:02:33 -0500 Subject: [PATCH 18/21] Fix: align continuation lines in sbdf_helpers.pxi to fix E127 pycodestyle violation Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf_helpers.pxi | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spotfire/sbdf_helpers.pxi b/spotfire/sbdf_helpers.pxi index d0e594b..c072c3b 100644 --- a/spotfire/sbdf_helpers.pxi +++ b/spotfire/sbdf_helpers.pxi @@ -23,10 +23,14 @@ cdef extern from "sbdf_helpers.h": # Utility functions for extracting strings from Python ``Union[str,bytes]`` into C. # vals_ptr: PyArray_DATA() of a numpy object array; inv_ptr: PyArray_DATA() of a numpy bool array. 
- sbdf_c.sbdf_object* _export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, - Py_ssize_t start, Py_ssize_t count) except NULL - sbdf_c.sbdf_object* _export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, - Py_ssize_t start, Py_ssize_t count) except NULL + sbdf_c.sbdf_object* _export_extract_string_obj(void **vals_ptr, + const unsigned char *inv_ptr, + Py_ssize_t start, + Py_ssize_t count) except NULL + sbdf_c.sbdf_object* _export_extract_binary_obj(void **vals_ptr, + const unsigned char *inv_ptr, + Py_ssize_t start, + Py_ssize_t count) except NULL # Fast Arrow LargeUtf8 path: no Python str objects, no re-encoding sbdf_c.sbdf_object* _export_extract_string_obj_arrow(const char *values_buf, const long long *offsets, From fd7479c8f46b45626d2171aede9ecc955ee6ea2c Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 19:57:14 -0500 Subject: [PATCH 19/21] linting --- spotfire/test/test_sbdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 9f096ba..101f596 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -1,4 +1,4 @@ -"""Tests for importing and exporting data to SBDF files.""" +git checkout """Tests for importing and exporting data to SBDF files.""" from pathlib import Path import datetime From bb1d35dda2f0970eb27e11b45f7b14ed8e5be89c Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 20:10:23 -0500 Subject: [PATCH 20/21] Fix: remove stray 'git checkout' prefix from test_sbdf.py module docstring Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 101f596..9f096ba 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -1,4 +1,4 @@ -git checkout """Tests for importing and exporting data to SBDF files.""" +"""Tests for importing and exporting data to SBDF 
files.""" from pathlib import Path import datetime From 87d0d07185ae71e91d0c50f81e32a9450a274d5e Mon Sep 17 00:00:00 2001 From: stewjb Date: Sun, 5 Apr 2026 09:28:02 -0500 Subject: [PATCH 21/21] Fix: wrap long function signatures in sbdf_helpers.c/.h to satisfy cpplint line-length rule Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf_helpers.c | 6 ++++-- spotfire/sbdf_helpers.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/spotfire/sbdf_helpers.c b/spotfire/sbdf_helpers.c index 5931cdb..964593c 100644 --- a/spotfire/sbdf_helpers.c +++ b/spotfire/sbdf_helpers.c @@ -84,7 +84,8 @@ void _allocated_list_done(struct _AllocatedList *alist, _allocated_dealloc_fn fu * vals_ptr is PyArray_DATA() of a numpy object array; each slot is a borrowed PyObject*. * inv_ptr is PyArray_DATA() of a numpy bool array; nonzero byte means null/invalid. */ -sbdf_object *_export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, Py_ssize_t start, Py_ssize_t count) { +sbdf_object *_export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count) { sbdf_object *t = calloc(1, sizeof(sbdf_object)); if (!t) { PyErr_NoMemory(); @@ -165,7 +166,8 @@ sbdf_object *_export_extract_string_obj_arrow(const char *values_buf, const int6 return t; } -sbdf_object *_export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, Py_ssize_t start, Py_ssize_t count) { +sbdf_object *_export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count) { sbdf_object *t = calloc(1, sizeof(sbdf_object)); if (!t) { PyErr_NoMemory(); diff --git a/spotfire/sbdf_helpers.h b/spotfire/sbdf_helpers.h index 95d90f6..a263b01 100644 --- a/spotfire/sbdf_helpers.h +++ b/spotfire/sbdf_helpers.h @@ -40,8 +40,10 @@ struct _SbdfDecimal { * inv_ptr: PyArray_DATA() of a numpy bool array (one byte per element, nonzero == null). 
* Both pointers must remain valid for the duration of the call (caller holds the numpy arrays). */ -extern sbdf_object *_export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, Py_ssize_t start, Py_ssize_t count); -extern sbdf_object *_export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, Py_ssize_t start, Py_ssize_t count); +extern sbdf_object *_export_extract_string_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count); +extern sbdf_object *_export_extract_binary_obj(void **vals_ptr, const unsigned char *inv_ptr, + Py_ssize_t start, Py_ssize_t count); /* Fast string export directly from Arrow LargeUtf8 buffers: no Python str objects created. * values_buf: concatenated UTF-8 bytes from the Arrow values buffer.