From 4868db9bc4408063ccf0968525de49e41f730859 Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 20:06:54 -0500 Subject: [PATCH 01/38] feat: polars functionality --- .gitignore | 7 +++ pyproject.toml | 6 +- spotfire/sbdf.pyi | 2 +- spotfire/sbdf.pyx | 126 ++++++++++++++++++++++++++++++++++++- spotfire/test/test_sbdf.py | 77 +++++++++++++++++++++++ 5 files changed, 213 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 8b22a61..9f0e1c7 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,13 @@ __pycache__/ # virtual environments /venv/ +/.venv/ + +# uv lock file (this is a library; lock files are for applications) +/uv.lock + +# Claude Code +/.claude # PyCharm project files /.idea diff --git a/pyproject.toml b/pyproject.toml index 9b68bf9..4588961 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,9 +54,13 @@ plot-seaborn = [ "seaborn >= 0.13.2", ] plot = [ "spotfire[plot-matplotlib,plot-pil,plot-seaborn]" ] +# Polars support +polars = [ + "polars >= 0.20.0", +] # Development requirements dev = [ - "spotfire[geo,plot]", + "spotfire[geo,plot,polars]", "Cython >= 3.0.4", "html-testRunner", ] diff --git a/spotfire/sbdf.pyi b/spotfire/sbdf.pyi index 625aff6..80d8fc4 100644 --- a/spotfire/sbdf.pyi +++ b/spotfire/sbdf.pyi @@ -13,6 +13,6 @@ class SBDFError(Exception): ... class SBDFWarning(Warning): ... def spotfire_typename_to_valuetype_id(typename: str) -> typing.Optional[int]: ... -def import_data(sbdf_file: _FilenameLike): ... +def import_data(sbdf_file: _FilenameLike, output_format: str = "pandas") -> typing.Any: ... def export_data(obj: typing.Any, sbdf_file: _FilenameLike, default_column_name: str = "x", rows_per_slice: int = 0, encoding_rle: bool = True) -> None: ... 
diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 2f005bf..ff10672 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -55,6 +55,11 @@ try: except ImportError: PIL = None +try: + import polars as pl +except ImportError: + pl = None + # Various utility helper functions for doing things that are problematic in PYX files include "sbdf_helpers.pxi" @@ -654,10 +659,11 @@ cdef dict _import_metadata(sbdf_c.sbdf_metadata_head* md, int column_num): return metadata -def import_data(sbdf_file): - """Import data from an SBDF file and create a 'pandas' DataFrame. +def import_data(sbdf_file, output_format="pandas"): + """Import data from an SBDF file and create a DataFrame. :param sbdf_file: the filename of the SBDF file to import + :param output_format: the format of the returned DataFrame; either 'pandas' (default) or 'polars' :return: the DataFrame containing the imported data :raises SBDFError: if a problem is encountered during import """ @@ -812,6 +818,10 @@ def import_data(sbdf_file): with warnings.catch_warnings(): warnings.simplefilter("ignore") dataframe.spotfire_table_metadata = table_metadata + if output_format == "polars": + if pl is None: + raise SBDFError("polars is not installed; install it with 'pip install spotfire[polars]'") + return pl.from_pandas(dataframe) return dataframe finally: @@ -1030,6 +1040,110 @@ cdef _export_obj_series(obj, default_column_name): return {}, [column_name], [column_metadata], [context] +cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): + """Determine a value type for a data set based on the Polars dtype for the series. 
+ + :param dtype: the Polars dtype object + :param series_description: description of series (for error reporting) + :return: the integer value type id representing the type of series + :raise SBDFError: if the dtype is unknown + """ + dtype_name = dtype.__class__.__name__ + if dtype_name == "Boolean": + return sbdf_c.SBDF_BOOLTYPEID + elif dtype_name in ("Int8", "Int16", "Int32", "UInt8", "UInt16"): + return sbdf_c.SBDF_INTTYPEID + elif dtype_name in ("Int64", "UInt32", "UInt64"): + return sbdf_c.SBDF_LONGTYPEID + elif dtype_name == "Float32": + return sbdf_c.SBDF_FLOATTYPEID + elif dtype_name == "Float64": + return sbdf_c.SBDF_DOUBLETYPEID + elif dtype_name in ("Utf8", "String"): + return sbdf_c.SBDF_STRINGTYPEID + elif dtype_name == "Date": + return sbdf_c.SBDF_DATETYPEID + elif dtype_name == "Datetime": + return sbdf_c.SBDF_DATETIMETYPEID + elif dtype_name == "Duration": + return sbdf_c.SBDF_TIMESPANTYPEID + elif dtype_name == "Time": + return sbdf_c.SBDF_TIMETYPEID + elif dtype_name == "Binary": + return sbdf_c.SBDF_BINARYTYPEID + elif dtype_name == "Decimal": + return sbdf_c.SBDF_DECIMALTYPEID + elif dtype_name == "Categorical": + return _export_infer_valuetype_from_polars_dtype(dtype.categories, series_description) + else: + raise SBDFError(f"unknown Polars dtype '{dtype_name}' in {series_description}") + + +cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series): + """Convert a Polars Series to a NumPy array suitable for the SBDF exporter. + + :param context: export context holding the resolved value type + :param series: Polars Series to convert + :return: NumPy ndarray of values + """ + dtype_name = series.dtype.__class__.__name__ + if dtype_name in ("Date", "Time"): + # The Date/Time exporters require Python date/time objects; + # Polars .to_numpy() returns numpy datetime64/int64 which those exporters do not accept. 
+ return np.asarray(series.to_list(), dtype=object) + na_value = context.get_numpy_na_value() + if na_value is not None: + return np.asarray(series.fill_null(na_value).to_numpy(allow_copy=True), + dtype=context.get_numpy_dtype()) + else: + return np.asarray(series.to_numpy(allow_copy=True), dtype=object) + + +cdef _export_obj_polars_dataframe(obj): + """Extract column information for a Polars ``DataFrame``. + + :param obj: Polars DataFrame object to export + :return: tuple containing dictionary of table metadata, list of column names, list of dictionaries of column + metadata, and list of export context objects + """ + if len(set(obj.columns)) != len(obj.columns): + raise SBDFError("obj does not have unique column names") + + column_names = [] + column_metadata = [] + exporter_contexts = [] + for col in obj.columns: + series = obj[col] + column_names.append(col) + context = _ExportContext() + context.set_valuetype_id(_export_infer_valuetype_from_polars_dtype(series.dtype, f"column '{col}'")) + invalids = series.is_null().to_numpy() + context.set_arrays(_export_polars_series_to_numpy(context, series), invalids) + column_metadata.append({}) + exporter_contexts.append(context) + + return {}, column_names, column_metadata, exporter_contexts + + +cdef _export_obj_polars_series(obj, default_column_name): + """Extract column information for a Polars ``Series``. 
+ + :param obj: Polars Series object to export + :param default_column_name: column name to use when obj does not have a name + :return: tuple containing dict of table metadata, list of column names, list of dicts of column metadata, and + list of export context objects + """ + column_name = obj.name if obj.name else default_column_name + description = f"series '{obj.name}'" if obj.name else "series" + + context = _ExportContext() + context.set_valuetype_id(_export_infer_valuetype_from_polars_dtype(obj.dtype, description)) + invalids = obj.is_null().to_numpy() + context.set_arrays(_export_polars_series_to_numpy(context, obj), invalids) + + return {}, [column_name], [{}], [context] + + cdef _export_obj_numpy(np_c.ndarray obj, default_column_name): """Extract column information for a NumPy ``ndarray``. @@ -1801,8 +1915,14 @@ def export_data(obj, sbdf_file, default_column_name="x", Py_ssize_t rows_per_sli try: # Extract data and metadata from obj + # Polars DataFrames (tabular) + if pl is not None and isinstance(obj, pl.DataFrame): + exported = _export_obj_polars_dataframe(obj) + # Polars Series (columnar) + elif pl is not None and isinstance(obj, pl.Series): + exported = _export_obj_polars_series(obj, default_column_name) # Pandas DataFrames (tabular) - if isinstance(obj, pd.DataFrame): + elif isinstance(obj, pd.DataFrame): exported = _export_obj_dataframe(obj) # Pandas Series (columnar) elif isinstance(obj, pd.Series): diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index de89774..13d2035 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -18,6 +18,11 @@ from packaging import version import spotfire + +try: + import polars as pl +except ImportError: + pl = None from spotfire import sbdf from spotfire.test import utils @@ -539,3 +544,75 @@ def _assert_dataframe_shape(self, dataframe: pd.DataFrame, rows: int, column_nam def _assert_is_png_image(self, expr: bytes) -> None: """Assert that a bytes object represents PNG image 
data.""" self.assertEqual(expr[0:8], b'\x89PNG\x0d\x0a\x1a\x0a') + + +@unittest.skipIf(pl is None, "polars not installed") +class SbdfPolarsTest(unittest.TestCase): + """Unit tests for Polars DataFrame support in 'spotfire.sbdf' module.""" + + def test_write_polars_dataframe_basic(self): + """Exporting a Polars DataFrame with common types should produce a valid SBDF file.""" + df = pl.DataFrame({ + "flag": [True, False, True], + "count": [1, 2, 3], + "value": [1.1, 2.2, 3.3], + "label": ["a", "b", "c"], + }) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/output.sbdf" + sbdf.export_data(df, path) + result = sbdf.import_data(path) + self.assertEqual(len(result), 3) + self.assertEqual(list(result.columns), ["flag", "count", "value", "label"]) + self.assertEqual(result["flag"].tolist(), [True, False, True]) + self.assertEqual(result["count"].dropna().astype(int).tolist(), [1, 2, 3]) + self.assertAlmostEqual(result["value"][0], 1.1) + self.assertEqual(result["label"].tolist(), ["a", "b", "c"]) + + def test_write_polars_dataframe_nulls(self): + """Exporting a Polars DataFrame with null values should preserve nulls.""" + df = pl.DataFrame({ + "ints": [1, None, 3], + "floats": [1.0, None, 3.0], + "strings": ["x", None, "z"], + }) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/output.sbdf" + sbdf.export_data(df, path) + result = sbdf.import_data(path) + self.assertTrue(pd.isnull(result["ints"][1])) + self.assertTrue(pd.isnull(result["floats"][1])) + self.assertTrue(pd.isnull(result["strings"][1])) + + def test_write_polars_series(self): + """Exporting a Polars Series should produce a valid SBDF file.""" + series = pl.Series("vals", [10, 20, 30]) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/output.sbdf" + sbdf.export_data(series, path) + result = sbdf.import_data(path) + self.assertEqual(len(result), 3) + self.assertEqual(result.columns[0], "vals") + 
self.assertEqual(result["vals"].dropna().astype(int).tolist(), [10, 20, 30]) + + def test_import_as_polars(self): + """Importing an SBDF file with output_format='polars' should return a Polars DataFrame.""" + dataframe = sbdf.import_data(utils.get_test_data_file("sbdf/1.sbdf"), output_format="polars") + self.assertIsInstance(dataframe, pl.DataFrame) + self.assertIn("Boolean", dataframe.columns) + self.assertIn("Integer", dataframe.columns) + + def test_polars_roundtrip(self): + """A Polars DataFrame should survive an export/import roundtrip.""" + original = pl.DataFrame({ + "integers": [1, 2, 3], + "floats": [1.5, 2.5, 3.5], + "strings": ["foo", "bar", "baz"], + }) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/roundtrip.sbdf" + sbdf.export_data(original, path) + result = sbdf.import_data(path, output_format="polars") + self.assertIsInstance(result, pl.DataFrame) + self.assertEqual(result["strings"].to_list(), ["foo", "bar", "baz"]) + self.assertAlmostEqual(result["floats"][0], 1.5) From 82492e5d3f2429f1988e196f2246f5cb919a3283 Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 20:45:54 -0500 Subject: [PATCH 02/38] linting and testing --- spotfire/sbdf.pyx | 87 +++++++++++++++++++++++++++++++++++--- spotfire/test/test_sbdf.py | 19 +++++---- 2 files changed, 93 insertions(+), 13 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index ff10672..234b588 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -425,6 +425,13 @@ cdef class _ImportContext: """ return _valuetype_id_to_spotfire_typename(self.value_type.id) + cpdef bint is_object_numpy_type(self): + """Return True if the numpy type for this column is NPY_OBJECT. + + :return: True if the numpy type is object, False otherwise + """ + return self.numpy_type_num == np_c.NPY_OBJECT + # Individual functions for importing each value type. 
ctypedef int(*importer_fn)(_ImportContext, sbdf_c.sbdf_columnslice*) @@ -659,6 +666,74 @@ cdef dict _import_metadata(sbdf_c.sbdf_metadata_head* md, int column_num): return metadata +cdef object _import_polars_dtype(_ImportContext context): + """Return the Polars dtype corresponding to the SBDF value type in the import context. + + :param context: import context for a column + :return: the Polars dtype object + """ + vt_id = context.value_type.id + if vt_id == sbdf_c.SBDF_BOOLTYPEID: + return pl.Boolean + elif vt_id == sbdf_c.SBDF_INTTYPEID: + return pl.Int32 + elif vt_id == sbdf_c.SBDF_LONGTYPEID: + return pl.Int64 + elif vt_id == sbdf_c.SBDF_FLOATTYPEID: + return pl.Float32 + elif vt_id == sbdf_c.SBDF_DOUBLETYPEID: + return pl.Float64 + elif vt_id == sbdf_c.SBDF_STRINGTYPEID: + return pl.Utf8 + elif vt_id == sbdf_c.SBDF_DATETIMETYPEID: + return pl.Datetime + elif vt_id == sbdf_c.SBDF_DATETYPEID: + return pl.Date + elif vt_id == sbdf_c.SBDF_TIMETYPEID: + return pl.Time + elif vt_id == sbdf_c.SBDF_TIMESPANTYPEID: + return pl.Duration + elif vt_id == sbdf_c.SBDF_BINARYTYPEID: + return pl.Binary + elif vt_id == sbdf_c.SBDF_DECIMALTYPEID: + return pl.Decimal + else: + return pl.Utf8 + + +cdef object _import_build_polars_dataframe(column_names, importer_contexts): + """Build a Polars DataFrame directly from import context data, with no Pandas intermediary. + + :param column_names: list of column name strings + :param importer_contexts: list of _ImportContext objects + :return: a Polars DataFrame + """ + series_list = [] + for i, name in enumerate(column_names): + context = importer_contexts[i] + values = context.get_values_array() + invalids = context.get_invalid_array() + polars_dtype = _import_polars_dtype(context) + + if context.is_object_numpy_type(): + # Object arrays hold Python objects (str, date, datetime, etc.); Polars cannot + # construct a typed series from a numpy object array directly — use a Python list. 
+ values_list = values.tolist() + if invalids.any(): + for idx in np.where(invalids)[0]: + values_list[idx] = None + col = pl.Series(name=name, values=values_list, dtype=polars_dtype) + else: + # Numeric arrays: numpy → Polars Series directly, then scatter nulls if needed. + col = pl.Series(name=name, values=values, dtype=polars_dtype) + if invalids.any(): + col = col.scatter(np.where(invalids)[0].tolist(), None) + + series_list.append(col) + + return pl.DataFrame(series_list) + + def import_data(sbdf_file, output_format="pandas"): """Import data from an SBDF file and create a DataFrame. @@ -780,7 +855,13 @@ def import_data(sbdf_file, output_format="pandas"): if error != sbdf_c.SBDF_OK and error != sbdf_c.SBDF_TABLEEND: raise SBDFError(f"error reading '{sbdf_file}': {sbdf_c.sbdf_err_get_str(error).decode('utf-8')}") - # Build a new DataFrame with the results + # Build a Polars DataFrame directly if requested, with no Pandas intermediary + if output_format == "polars": + if pl is None: + raise SBDFError("polars is not installed; install it with 'pip install spotfire[polars]'") + return _import_build_polars_dataframe(column_names, importer_contexts) + + # Build a new Pandas DataFrame with the results imported_columns = [] for i in range(num_columns): column_series = pd.Series(importer_contexts[i].get_values_array(), @@ -818,10 +899,6 @@ def import_data(sbdf_file, output_format="pandas"): with warnings.catch_warnings(): warnings.simplefilter("ignore") dataframe.spotfire_table_metadata = table_metadata - if output_format == "polars": - if pl is None: - raise SBDFError("polars is not installed; install it with 'pip install spotfire[polars]'") - return pl.from_pandas(dataframe) return dataframe finally: diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 13d2035..c9e9e79 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -22,7 +22,7 @@ try: import polars as pl except ImportError: - pl = None + pl = None # type: 
ignore[assignment] from spotfire import sbdf from spotfire.test import utils @@ -550,9 +550,9 @@ def _assert_is_png_image(self, expr: bytes) -> None: class SbdfPolarsTest(unittest.TestCase): """Unit tests for Polars DataFrame support in 'spotfire.sbdf' module.""" - def test_write_polars_dataframe_basic(self): + def test_write_polars_basic(self): """Exporting a Polars DataFrame with common types should produce a valid SBDF file.""" - df = pl.DataFrame({ + polars_df = pl.DataFrame({ "flag": [True, False, True], "count": [1, 2, 3], "value": [1.1, 2.2, 3.3], @@ -560,7 +560,7 @@ def test_write_polars_dataframe_basic(self): }) with tempfile.TemporaryDirectory() as tempdir: path = f"{tempdir}/output.sbdf" - sbdf.export_data(df, path) + sbdf.export_data(polars_df, path) result = sbdf.import_data(path) self.assertEqual(len(result), 3) self.assertEqual(list(result.columns), ["flag", "count", "value", "label"]) @@ -569,16 +569,16 @@ def test_write_polars_dataframe_basic(self): self.assertAlmostEqual(result["value"][0], 1.1) self.assertEqual(result["label"].tolist(), ["a", "b", "c"]) - def test_write_polars_dataframe_nulls(self): + def test_write_polars_nulls(self): """Exporting a Polars DataFrame with null values should preserve nulls.""" - df = pl.DataFrame({ + polars_df = pl.DataFrame({ "ints": [1, None, 3], "floats": [1.0, None, 3.0], "strings": ["x", None, "z"], }) with tempfile.TemporaryDirectory() as tempdir: path = f"{tempdir}/output.sbdf" - sbdf.export_data(df, path) + sbdf.export_data(polars_df, path) result = sbdf.import_data(path) self.assertTrue(pd.isnull(result["ints"][1])) self.assertTrue(pd.isnull(result["floats"][1])) @@ -596,11 +596,14 @@ def test_write_polars_series(self): self.assertEqual(result["vals"].dropna().astype(int).tolist(), [10, 20, 30]) def test_import_as_polars(self): - """Importing an SBDF file with output_format='polars' should return a Polars DataFrame.""" + """Importing an SBDF file with output_format='polars' should return a native Polars 
DataFrame.""" dataframe = sbdf.import_data(utils.get_test_data_file("sbdf/1.sbdf"), output_format="polars") self.assertIsInstance(dataframe, pl.DataFrame) + self.assertNotIsInstance(dataframe, pd.DataFrame) self.assertIn("Boolean", dataframe.columns) self.assertIn("Integer", dataframe.columns) + # Verify nulls are preserved natively + self.assertIsNone(dataframe["Long"][0]) def test_polars_roundtrip(self): """A Polars DataFrame should survive an export/import roundtrip.""" From 003029192d2499296b577ed89c4f01b295515dc0 Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 21:07:12 -0500 Subject: [PATCH 03/38] Fix Polars edge cases: Categorical/Enum, UInt64 overflow, tz-aware Datetime, scatter compat - Fix Categorical/Enum dtype: was incorrectly trying to recurse into dtype.categories (which doesn't exist on the dtype object); now casts series to Utf8 and maps to SBDF_STRINGTYPEID directly - Add Enum dtype support (previously raised SBDFError) - Warn on UInt64 export: values above Int64 max will overflow silently - Warn on timezone-aware Datetime export: tz info is not preserved in SBDF - Warn on Decimal export: marked experimental, precision may be lost - Fix scatter() compatibility: add AttributeError fallback to set_at_idx() for older Polars versions within the supported range - Add tests for all of the above Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 24 +++++++++++++++++++++--- spotfire/test/test_sbdf.py | 27 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 234b588..4b0097f 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -727,7 +727,12 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): # Numeric arrays: numpy → Polars Series directly, then scatter nulls if needed. 
col = pl.Series(name=name, values=values, dtype=polars_dtype) if invalids.any(): - col = col.scatter(np.where(invalids)[0].tolist(), None) + indices = np.where(invalids)[0].tolist() + try: + col = col.scatter(indices, None) + except AttributeError: + # Fallback for older Polars versions that use set_at_idx + col = col.set_at_idx(indices, None) series_list.append(col) @@ -1131,6 +1136,9 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): elif dtype_name in ("Int8", "Int16", "Int32", "UInt8", "UInt16"): return sbdf_c.SBDF_INTTYPEID elif dtype_name in ("Int64", "UInt32", "UInt64"): + if dtype_name == "UInt64": + warnings.warn(f"Polars UInt64 type in {series_description} will be exported as LongInteger (signed " + f"64-bit); values above 9,223,372,036,854,775,807 will overflow", SBDFWarning) return sbdf_c.SBDF_LONGTYPEID elif dtype_name == "Float32": return sbdf_c.SBDF_FLOATTYPEID @@ -1141,6 +1149,9 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): elif dtype_name == "Date": return sbdf_c.SBDF_DATETYPEID elif dtype_name == "Datetime": + if getattr(dtype, 'time_zone', None) is not None: + warnings.warn(f"Polars Datetime type in {series_description} has timezone '{dtype.time_zone}'; " + f"timezone information will not be preserved in SBDF", SBDFWarning) return sbdf_c.SBDF_DATETIMETYPEID elif dtype_name == "Duration": return sbdf_c.SBDF_TIMESPANTYPEID @@ -1149,9 +1160,12 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): elif dtype_name == "Binary": return sbdf_c.SBDF_BINARYTYPEID elif dtype_name == "Decimal": + warnings.warn(f"Polars Decimal type in {series_description} export is experimental; " + f"precision may not be fully preserved", SBDFWarning) return sbdf_c.SBDF_DECIMALTYPEID - elif dtype_name == "Categorical": - return _export_infer_valuetype_from_polars_dtype(dtype.categories, series_description) + elif dtype_name in ("Categorical", "Enum"): + # SBDF has no categorical 
type; export as String + return sbdf_c.SBDF_STRINGTYPEID else: raise SBDFError(f"unknown Polars dtype '{dtype_name}' in {series_description}") @@ -1164,6 +1178,10 @@ cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series) :return: NumPy ndarray of values """ dtype_name = series.dtype.__class__.__name__ + if dtype_name in ("Categorical", "Enum"): + # Cast to String so .to_numpy() returns plain Python strings + series = series.cast(pl.Utf8) + dtype_name = "Utf8" if dtype_name in ("Date", "Time"): # The Date/Time exporters require Python date/time objects; # Polars .to_numpy() returns numpy datetime64/int64 which those exporters do not accept. diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index c9e9e79..8c2a709 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -605,6 +605,33 @@ def test_import_as_polars(self): # Verify nulls are preserved natively self.assertIsNone(dataframe["Long"][0]) + def test_write_polars_categorical(self): + """Exporting a Polars Categorical column should export as String.""" + polars_df = pl.DataFrame({"cat": pl.Series(["a", "b", "a"]).cast(pl.Categorical)}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/output.sbdf" + sbdf.export_data(polars_df, path) + result = sbdf.import_data(path) + self.assertEqual(result["cat"].tolist(), ["a", "b", "a"]) + + def test_write_polars_uint64_warns(self): + """Exporting a Polars UInt64 column should emit a warning about overflow risk.""" + polars_df = pl.DataFrame({"big": pl.Series([1, 2, 3], dtype=pl.UInt64)}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/output.sbdf" + with self.assertWarns(sbdf.SBDFWarning): + sbdf.export_data(polars_df, path) + + def test_write_polars_datetime_tz(self): + """Exporting a timezone-aware Polars Datetime column should warn about timezone loss.""" + polars_df = pl.DataFrame({ + "ts": pl.Series([datetime.datetime(2024, 1, 1)]).dt.replace_time_zone("UTC") + }) 
+ with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/output.sbdf" + with self.assertWarns(sbdf.SBDFWarning): + sbdf.export_data(polars_df, path) + def test_polars_roundtrip(self): """A Polars DataFrame should survive an export/import roundtrip.""" original = pl.DataFrame({ From cef91075583b311f7bc56c898d362c18b3f2abfd Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 21:20:04 -0500 Subject: [PATCH 04/38] Add polars to CI test requirements and document in README - Add polars to test_requirements_default.txt so SbdfPolarsTest is actually executed in CI (previously skipped due to missing import) - Add spotfire[polars] row to extras table in README - Add usage note explaining Spotfire's bundled Python lacks Polars and that SPKs bundling Polars will be ~44 MB larger than typical packages Co-Authored-By: Claude Sonnet 4.6 --- README.md | 9 +++++++++ test_requirements_default.txt | 1 + 2 files changed, 10 insertions(+) diff --git a/README.md b/README.md index 14b0297..62dab02 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,16 @@ simply `spotfire`) to include the required Python packages to support optional f | `spotfire[plot-matplotlib]` | Plotting support using just `matplotlib` | | `spotfire[plot-pil]` | Plotting support using just `Pillow` | | `spotfire[plot-seaborn]` | Plotting support using just `seaborn` | +| `spotfire[polars]` | Polars DataFrame support | | `spotfire[dev,lint]` | Internal development | +Once installed, `export_data()` accepts `polars.DataFrame` and `polars.Series` directly, and +`import_data()` can return a `polars.DataFrame` via `output_format="polars"`. + +> **Note for Spotfire data functions:** Spotfire's bundled Python interpreter does not include +> Polars. To use Polars inside a data function, configure Spotfire to use a custom Python +> environment that has `polars` installed. 
Polars is a large binary package (~44 MB), so +> Spotfire Packages (SPKs) that bundle it will be significantly larger than typical packages. + ### License BSD-type 3-Clause License. See the file ```LICENSE``` included in the package. \ No newline at end of file diff --git a/test_requirements_default.txt b/test_requirements_default.txt index 73ab30d..7468679 100644 --- a/test_requirements_default.txt +++ b/test_requirements_default.txt @@ -2,5 +2,6 @@ html-testRunner geopandas matplotlib pillow +polars seaborn shapely \ No newline at end of file From 1bd219849b44847aad92e6cbac0a2da978f396cc Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 21:36:05 -0500 Subject: [PATCH 05/38] Harden Polars support: validation, warnings, and edge case tests - Raise SBDFError for unknown output_format values (previously fell through silently to Pandas) - Emit SBDFWarning when Categorical/Enum columns are exported as String, consistent with existing UInt64 and timezone warnings - Add test_invalid_output_format: verifies bad output_format raises - Add test_write_polars_empty: verifies empty DataFrame exports cleanly - Add test_write_polars_series_nulls: verifies null preservation in Series - Add test_polars_categorical_warns: verifies Categorical warning fires Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 6 ++++++ spotfire/test/test_sbdf.py | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 4b0097f..faea3b6 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -860,6 +860,10 @@ def import_data(sbdf_file, output_format="pandas"): if error != sbdf_c.SBDF_OK and error != sbdf_c.SBDF_TABLEEND: raise SBDFError(f"error reading '{sbdf_file}': {sbdf_c.sbdf_err_get_str(error).decode('utf-8')}") + # Validate output_format before doing anything with it + if output_format not in ("pandas", "polars"): + raise SBDFError(f"unknown output_format {output_format!r}; expected 'pandas' or 
'polars'") + # Build a Polars DataFrame directly if requested, with no Pandas intermediary if output_format == "polars": if pl is None: @@ -1165,6 +1169,8 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): return sbdf_c.SBDF_DECIMALTYPEID elif dtype_name in ("Categorical", "Enum"): # SBDF has no categorical type; export as String + warnings.warn(f"Polars {dtype_name} type in {series_description} will be exported as String; " + f"category information will not be preserved", SBDFWarning) return sbdf_c.SBDF_STRINGTYPEID else: raise SBDFError(f"unknown Polars dtype '{dtype_name}' in {series_description}") diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 8c2a709..eb4cf17 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -646,3 +646,43 @@ def test_polars_roundtrip(self): self.assertIsInstance(result, pl.DataFrame) self.assertEqual(result["strings"].to_list(), ["foo", "bar", "baz"]) self.assertAlmostEqual(result["floats"][0], 1.5) + + def test_invalid_output_format(self): + """Passing an unknown output_format should raise SBDFError immediately.""" + polars_df = pl.DataFrame({"x": [1, 2, 3]}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/output.sbdf" + sbdf.export_data(polars_df, path) + with self.assertRaises(sbdf.SBDFError): + sbdf.import_data(path, output_format="numpy") + + def test_write_polars_empty(self): + """Exporting an empty Polars DataFrame should produce a valid (empty) SBDF file.""" + polars_df = pl.DataFrame({"a": pl.Series([], dtype=pl.Int32), + "b": pl.Series([], dtype=pl.Utf8)}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/empty.sbdf" + sbdf.export_data(polars_df, path) + result = sbdf.import_data(path) + self.assertEqual(len(result), 0) + self.assertIn("a", result.columns) + self.assertIn("b", result.columns) + + def test_write_polars_series_nulls(self): + """Exporting a Polars Series with null values should preserve those 
nulls.""" + series = pl.Series("vals", [1, None, 3], dtype=pl.Int32) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/series_nulls.sbdf" + sbdf.export_data(series, path) + result = sbdf.import_data(path) + self.assertTrue(pd.isnull(result["vals"][1])) + self.assertEqual(int(result["vals"][0]), 1) + self.assertEqual(int(result["vals"][2]), 3) + + def test_polars_categorical_warns(self): + """Exporting a Polars Categorical column should emit a SBDFWarning.""" + polars_df = pl.DataFrame({"cat": pl.Series(["x", "y", "x"]).cast(pl.Categorical)}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/cat_warn.sbdf" + with self.assertWarns(sbdf.SBDFWarning): + sbdf.export_data(polars_df, path) From 6761de013f0f01d956dbe3f57d9de4f1dfa80bb6 Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 21:44:50 -0500 Subject: [PATCH 06/38] Handle Polars Null dtype on export A Polars Series of [None, None, None] has dtype pl.Null (no type can be inferred). Previously this raised SBDFError with "unknown dtype". Now it exports as an all-invalid String column, consistent with how all-None Pandas columns are handled. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 6 ++++++ spotfire/test/test_sbdf.py | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index faea3b6..b247a5b 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1172,6 +1172,9 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): warnings.warn(f"Polars {dtype_name} type in {series_description} will be exported as String; " f"category information will not be preserved", SBDFWarning) return sbdf_c.SBDF_STRINGTYPEID + elif dtype_name == "Null": + # All-null series with no inferred type; export as an all-invalid String column + return sbdf_c.SBDF_STRINGTYPEID else: raise SBDFError(f"unknown Polars dtype '{dtype_name}' in {series_description}") @@ -1184,6 +1187,9 @@ cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series) :return: NumPy ndarray of values """ dtype_name = series.dtype.__class__.__name__ + if dtype_name == "Null": + # All-null series: produce an object array of Nones; invalids mask will cover all rows + return np.full(len(series), None, dtype=object) if dtype_name in ("Categorical", "Enum"): # Cast to String so .to_numpy() returns plain Python strings series = series.cast(pl.Utf8) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index eb4cf17..ce1008b 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -686,3 +686,15 @@ def test_polars_categorical_warns(self): path = f"{tempdir}/cat_warn.sbdf" with self.assertWarns(sbdf.SBDFWarning): sbdf.export_data(polars_df, path) + + def test_write_polars_null_dtype(self): + """Exporting a Polars all-null Series (dtype=Null) should produce an all-invalid column.""" + polars_df = pl.DataFrame({"nothing": pl.Series([None, None, None])}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/null_dtype.sbdf" + sbdf.export_data(polars_df, path) + result = sbdf.import_data(path) + 
self.assertEqual(len(result), 3) + self.assertTrue(pd.isnull(result["nothing"][0])) + self.assertTrue(pd.isnull(result["nothing"][1])) + self.assertTrue(pd.isnull(result["nothing"][2])) From 441cddbe0cd8bf1c6fe4e6b217f359bafa198b2b Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 22:00:56 -0500 Subject: [PATCH 07/38] Fix mypy error for polars import in test file CI static analysis runs mypy without polars installed; add type: ignore[import-not-found] so mypy skips the missing stub. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index ce1008b..4cf944b 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -20,7 +20,7 @@ import spotfire try: - import polars as pl + import polars as pl # type: ignore[import-not-found] except ImportError: pl = None # type: ignore[assignment] from spotfire import sbdf From a0a86ceb851b338ef4a92d25604555430ede25db Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 22:12:13 -0500 Subject: [PATCH 08/38] Add reviewer-facing comments to Polars implementation Explain non-obvious choices that would otherwise prompt review questions: - Why dtype.__class__.__name__ instead of isinstance() - Why scatter()/set_at_idx() try/except exists and which versions it covers - Why is_object_numpy_type() cpdef wrapper is needed for a cdef attribute - Why the output_format polars path short-circuits before pd.concat - Why the Null dtype path returns a placeholder array Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index b247a5b..20890d6 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -429,6 +429,11 @@ cdef class _ImportContext: """Return True if the numpy type for this column is NPY_OBJECT. 
:return: True if the numpy type is object, False otherwise + + .. note:: ``numpy_type_num`` is a ``cdef`` attribute and is therefore inaccessible from + Python-side ``cdef object`` functions. This ``cpdef`` wrapper exposes it so that + :func:`_import_build_polars_dataframe` can branch on it without touching the + Cython-only attribute directly. """ return self.numpy_type_num == np_c.NPY_OBJECT @@ -729,10 +734,9 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): if invalids.any(): indices = np.where(invalids)[0].tolist() try: - col = col.scatter(indices, None) + col = col.scatter(indices, None) # Polars >= 0.19 except AttributeError: - # Fallback for older Polars versions that use set_at_idx - col = col.set_at_idx(indices, None) + col = col.set_at_idx(indices, None) # Polars < 0.19 API series_list.append(col) @@ -864,7 +868,10 @@ def import_data(sbdf_file, output_format="pandas"): if output_format not in ("pandas", "polars"): raise SBDFError(f"unknown output_format {output_format!r}; expected 'pandas' or 'polars'") - # Build a Polars DataFrame directly if requested, with no Pandas intermediary + # Short-circuit before pd.concat to avoid the Pandas intermediary entirely. + # This keeps the import zero-copy for large DataFrames: numpy arrays collected + # by each _ImportContext go straight into Polars Series without ever becoming + # a Pandas DataFrame. if output_format == "polars": if pl is None: raise SBDFError("polars is not installed; install it with 'pip install spotfire[polars]'") @@ -1134,6 +1141,10 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): :return: the integer value type id representing the type of series :raise SBDFError: if the dtype is unknown """ + # Use __class__.__name__ rather than isinstance() checks. 
Polars dtype objects are + # not ordinary Python classes resolvable at Cython compile time, so isinstance() would + # require importing the exact dtype class — which breaks when Polars isn't installed. + # Class name strings are stable across the Polars versions we support (>= 0.20). dtype_name = dtype.__class__.__name__ if dtype_name == "Boolean": return sbdf_c.SBDF_BOOLTYPEID @@ -1173,7 +1184,9 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): f"category information will not be preserved", SBDFWarning) return sbdf_c.SBDF_STRINGTYPEID elif dtype_name == "Null": - # All-null series with no inferred type; export as an all-invalid String column + # pl.Series([None, None]) has dtype Null when no type can be inferred. Export as + # String; _export_polars_series_to_numpy produces a placeholder array and the + # invalids mask marks every row missing, so the stored values are never read. return sbdf_c.SBDF_STRINGTYPEID else: raise SBDFError(f"unknown Polars dtype '{dtype_name}' in {series_description}") @@ -1188,7 +1201,9 @@ cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series) """ dtype_name = series.dtype.__class__.__name__ if dtype_name == "Null": - # All-null series: produce an object array of Nones; invalids mask will cover all rows + # A Null-dtype series has no values to convert; return a same-length placeholder array. + # The invalids mask (set by the caller via series.is_null()) marks every row as missing, + # so the placeholder values are never read by the SBDF writer. 
return np.full(len(series), None, dtype=object) if dtype_name in ("Categorical", "Enum"): # Cast to String so .to_numpy() returns plain Python strings From bf8e984ded4cc10385b08303cee1bcb23346cf5e Mon Sep 17 00:00:00 2001 From: stewjb Date: Mon, 23 Mar 2026 22:15:56 -0500 Subject: [PATCH 09/38] Remove set_at_idx fallback; scatter() is available in all supported Polars versions (>= 0.20) Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 20890d6..7b90a09 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -733,10 +733,7 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): col = pl.Series(name=name, values=values, dtype=polars_dtype) if invalids.any(): indices = np.where(invalids)[0].tolist() - try: - col = col.scatter(indices, None) # Polars >= 0.19 - except AttributeError: - col = col.set_at_idx(indices, None) # Polars < 0.19 API + col = col.scatter(indices, None) series_list.append(col) From 00d81cff097e7d3593cc80b43e702525b354ada6 Mon Sep 17 00:00:00 2001 From: stewjb Date: Tue, 24 Mar 2026 05:23:24 -0500 Subject: [PATCH 10/38] Address Copilot review comments - Move output_format validation to top of import_data() for fail-fast behaviour before the file is opened - Raise SBDFError in _import_polars_dtype fallback instead of silently returning Utf8 for unknown SBDF type IDs - Treat NaN as invalid (missing) for Float32/Float64 columns, matching Pandas pd.isnull() behaviour; add test_write_polars_float_nan - Keep native datetime64/timedelta64 arrays for Datetime/Duration columns instead of boxing to object dtype (avoids unnecessary copy) - Add @overload signatures to sbdf.pyi so callers get pd.DataFrame for the default output_format="pandas" and Any for output_format="polars" Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyi | 4 ++++ spotfire/sbdf.pyx | 24 +++++++++++++++++------- spotfire/test/test_sbdf.py | 
11 +++++++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/spotfire/sbdf.pyi b/spotfire/sbdf.pyi index 80d8fc4..9bd2812 100644 --- a/spotfire/sbdf.pyi +++ b/spotfire/sbdf.pyi @@ -13,6 +13,10 @@ class SBDFError(Exception): ... class SBDFWarning(Warning): ... def spotfire_typename_to_valuetype_id(typename: str) -> typing.Optional[int]: ... +@typing.overload +def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["pandas"] = ...) -> pd.DataFrame: ... +@typing.overload +def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["polars"]) -> typing.Any: ... def import_data(sbdf_file: _FilenameLike, output_format: str = "pandas") -> typing.Any: ... def export_data(obj: typing.Any, sbdf_file: _FilenameLike, default_column_name: str = "x", rows_per_slice: int = 0, encoding_rle: bool = True) -> None: ... diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 7b90a09..28770f5 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -703,7 +703,7 @@ cdef object _import_polars_dtype(_ImportContext context): elif vt_id == sbdf_c.SBDF_DECIMALTYPEID: return pl.Decimal else: - return pl.Utf8 + raise SBDFError(f"unsupported SBDF value type id {vt_id} for Polars output") cdef object _import_build_polars_dataframe(column_names, importer_contexts): @@ -748,6 +748,10 @@ def import_data(sbdf_file, output_format="pandas"): :return: the DataFrame containing the imported data :raises SBDFError: if a problem is encountered during import """ + # Validate output_format before opening the file so we fail fast on bad input. 
+ if output_format not in ("pandas", "polars"): + raise SBDFError(f"unknown output_format {output_format!r}; expected 'pandas' or 'polars'") + cdef int error, i cdef stdio.FILE* input_file = NULL cdef int major_v, minor_v @@ -861,10 +865,6 @@ def import_data(sbdf_file, output_format="pandas"): if error != sbdf_c.SBDF_OK and error != sbdf_c.SBDF_TABLEEND: raise SBDFError(f"error reading '{sbdf_file}': {sbdf_c.sbdf_err_get_str(error).decode('utf-8')}") - # Validate output_format before doing anything with it - if output_format not in ("pandas", "polars"): - raise SBDFError(f"unknown output_format {output_format!r}; expected 'pandas' or 'polars'") - # Short-circuit before pd.concat to avoid the Pandas intermediary entirely. # This keeps the import zero-copy for large DataFrames: numpy arrays collected # by each _ImportContext go straight into Polars Series without ever becoming @@ -1210,6 +1210,10 @@ cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series) # The Date/Time exporters require Python date/time objects; # Polars .to_numpy() returns numpy datetime64/int64 which those exporters do not accept. return np.asarray(series.to_list(), dtype=object) + if dtype_name in ("Datetime", "Duration"): + # Keep native datetime64/timedelta64 arrays; the invalids mask handles nulls (NaT cells + # are marked invalid and ignored by the SBDF writer). Boxing to object would be slower. 
+ return series.to_numpy(allow_copy=True) na_value = context.get_numpy_na_value() if na_value is not None: return np.asarray(series.fill_null(na_value).to_numpy(allow_copy=True), @@ -1236,7 +1240,10 @@ cdef _export_obj_polars_dataframe(obj): column_names.append(col) context = _ExportContext() context.set_valuetype_id(_export_infer_valuetype_from_polars_dtype(series.dtype, f"column '{col}'")) - invalids = series.is_null().to_numpy() + if series.dtype.__class__.__name__ in ("Float32", "Float64"): + invalids = (series.is_null() | series.is_nan()).to_numpy() + else: + invalids = series.is_null().to_numpy() context.set_arrays(_export_polars_series_to_numpy(context, series), invalids) column_metadata.append({}) exporter_contexts.append(context) @@ -1257,7 +1264,10 @@ cdef _export_obj_polars_series(obj, default_column_name): context = _ExportContext() context.set_valuetype_id(_export_infer_valuetype_from_polars_dtype(obj.dtype, description)) - invalids = obj.is_null().to_numpy() + if obj.dtype.__class__.__name__ in ("Float32", "Float64"): + invalids = (obj.is_null() | obj.is_nan()).to_numpy() + else: + invalids = obj.is_null().to_numpy() context.set_arrays(_export_polars_series_to_numpy(context, obj), invalids) return {}, [column_name], [{}], [context] diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 4cf944b..b048ac5 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -698,3 +698,14 @@ def test_write_polars_null_dtype(self): self.assertTrue(pd.isnull(result["nothing"][0])) self.assertTrue(pd.isnull(result["nothing"][1])) self.assertTrue(pd.isnull(result["nothing"][2])) + + def test_write_polars_float_nan(self): + """NaN in a Polars float column should be treated as invalid (missing), not a real value.""" + polars_df = pl.DataFrame({"vals": pl.Series([1.0, float("nan"), 3.0])}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/float_nan.sbdf" + sbdf.export_data(polars_df, path) + result = 
sbdf.import_data(path) + self.assertAlmostEqual(result["vals"][0], 1.0) + self.assertTrue(pd.isnull(result["vals"][1])) + self.assertAlmostEqual(result["vals"][2], 3.0) From 79d62d1634cabbd36170faa5d0fcd39a1ab11101 Mon Sep 17 00:00:00 2001 From: stewjb Date: Tue, 24 Mar 2026 19:47:14 -0500 Subject: [PATCH 11/38] =?UTF-8?q?Fix=20dict-of-lists=20export=20bug=20and?= =?UTF-8?q?=20O(n=C2=B2)=20iterable=20export=20loop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _export_obj_dict_of_lists (line 1313): np.array(n) where n is an integer creates a 0-dimensional array, not a 1-D array of length n. Every export_data({"col": [...]}) call would raise IndexError. Fixed to np.empty(shape, ...). _export_obj_iterable (lines 1358-1366): np.append inside a for loop reallocates the entire array on every iteration — O(n²) for a column of n rows. Replaced with list accumulation and a single np.array() call at the end. Add test_export_dict_of_lists and test_export_list to cover both paths (previously untested, which is why the bug went undetected). 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 17 +++++++++-------- spotfire/test/test_sbdf.py | 16 ++++++++++++++++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 28770f5..1e53146 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1310,7 +1310,7 @@ cdef _export_obj_dict_of_lists(dict obj): context = _ExportContext() context.set_valuetype_id(_export_infer_valuetype_from_type(obj[col], f"column '{col}'")) shape = len(obj[col]) - values = np.array(shape, dtype=context.get_numpy_dtype()) + values = np.empty(shape, dtype=context.get_numpy_dtype()) for i in range(shape): if pd.isnull(obj[col][i]): values[i] = context.get_numpy_na_value() @@ -1355,16 +1355,17 @@ cdef _export_obj_iterable(obj, default_column_name): context = _ExportContext() context.set_valuetype_id(_export_infer_valuetype_from_type(obj, "list")) - values = np.empty(0, dtype=context.get_numpy_dtype()) - invalids = np.empty(0, dtype="bool") + values_list = [] + invalids_list = [] for x in obj: if pd.isnull(x): - values = np.append(values, context.get_numpy_na_value()) - invalids = np.append(invalids, True) + values_list.append(context.get_numpy_na_value()) + invalids_list.append(True) else: - values = np.append(values, x) - invalids = np.append(invalids, False) - context.set_arrays(values, invalids) + values_list.append(x) + invalids_list.append(False) + context.set_arrays(np.array(values_list, dtype=context.get_numpy_dtype()), + np.array(invalids_list, dtype="bool")) return {}, [default_column_name], [{}], [context] diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index b048ac5..5a5d2db 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -509,6 +509,22 @@ def test_image_pil(self): else: self.fail(f"Expected PNG bytes, got {type(val)}: {val!r}") + def test_export_dict_of_lists(self): + """Exporting a dict of lists should produce a valid SBDF file.""" + data = {"ints": [1, 2, 3], 
"floats": [1.1, 2.2, 3.3], "strings": ["a", "b", "c"]} + result = self._roundtrip_dataframe(data) + self.assertEqual(len(result), 3) + self.assertEqual(result["ints"].dropna().astype(int).tolist(), [1, 2, 3]) + self.assertAlmostEqual(result["floats"][0], 1.1) + self.assertEqual(result["strings"].tolist(), ["a", "b", "c"]) + + def test_export_list(self): + """Exporting a plain Python list should produce a single-column SBDF file.""" + result = self._roundtrip_dataframe([10, 20, 30]) + self.assertEqual(len(result), 3) + self.assertEqual(result.columns[0], "x") + self.assertEqual(result["x"].dropna().astype(int).tolist(), [10, 20, 30]) + def test_export_import_unicode_path(self): """Test export and import with a Unicode file path.""" dataframe = pd.DataFrame({"col": [1, 2, 3], "txt": ["a", "b", "c"]}) From aeae3ab8c282a9370342c6f8c70d548570410f2b Mon Sep 17 00:00:00 2001 From: stewjb Date: Tue, 24 Mar 2026 20:06:10 -0500 Subject: [PATCH 12/38] Build nullable integer columns with mask in one shot on import For Int32/Int64 columns, the previous code constructed a pd.Series and then assigned nulls via .loc[mask] = None in a second pass, which triggers Pandas dtype coercion overhead internally. Replace with pd.arrays.IntegerArray(values, mask) which constructs the nullable integer array with the validity mask in a single operation, avoiding the second pass entirely. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 1e53146..2b7e94e 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -877,10 +877,19 @@ def import_data(sbdf_file, output_format="pandas"): # Build a new Pandas DataFrame with the results imported_columns = [] for i in range(num_columns): - column_series = pd.Series(importer_contexts[i].get_values_array(), - dtype=importer_contexts[i].get_pandas_dtype_name(), - name=column_names[i]) - column_series.loc[importer_contexts[i].get_invalid_array()] = None + values = importer_contexts[i].get_values_array() + invalid_array = importer_contexts[i].get_invalid_array() + dtype_name = importer_contexts[i].get_pandas_dtype_name() + if dtype_name in ("Int32", "Int64"): + # Build nullable integer array with mask in one shot; avoids a second-pass + # .loc assignment that triggers Pandas dtype coercion overhead. + base_dtype = "int32" if dtype_name == "Int32" else "int64" + column_series = pd.Series( + pd.arrays.IntegerArray(values.astype(base_dtype), invalid_array), + name=column_names[i]) + else: + column_series = pd.Series(values, dtype=dtype_name, name=column_names[i]) + column_series.loc[invalid_array] = None imported_columns.append(column_series) dataframe = pd.concat(imported_columns, axis=1) for i in range(num_columns): From d1955dfcca08c10eefc3a2db4e413b1a5f90e119 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 12:00:27 -0500 Subject: [PATCH 13/38] Address review: metadata warnings, descriptive errors, and 1-copy datetime import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Emit SBDFWarning on both Polars import and export paths pointing to polars-rs/polars#5117 so metadata loss is never silent. 
- Raise TypeError with a Polars-specific message from copy_metadata(), get_spotfire_types(), and set_spotfire_types() instead of a generic error. - For the Polars import path, bypass the Python-boxing importers for DateTime/Date/TimeSpan: store raw int64 ms values via _import_vts_numpy, then in _import_build_polars_dataframe subtract the SBDF-to-Unix epoch offset in-place and reinterpret via .view() — reducing peak memory from 3 live copies to 1-2 (down from creating Python datetime objects). - String/Time/Binary/Decimal import: release the concatenated numpy array before building the Polars Arrow buffer (del + clear_values_arrays()) to cap peak at 2 live copies instead of 3. - Add get_value_type_id() and clear_values_arrays() cpdef helpers on _ImportContext to support the above without Cython-level casts. - Add 6 new tests covering the metadata warning and descriptive error paths. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/public.py | 16 +++++ spotfire/sbdf.pyx | 131 ++++++++++++++++++++++++++++++++----- spotfire/test/test_sbdf.py | 43 ++++++++++++ 3 files changed, 172 insertions(+), 18 deletions(-) diff --git a/spotfire/public.py b/spotfire/public.py index bf13af4..2d1fd02 100644 --- a/spotfire/public.py +++ b/spotfire/public.py @@ -18,6 +18,16 @@ _ColumnTypes = dict[str, str] +_POLARS_METADATA_ERROR = ( + "Polars DataFrames do not support Spotfire metadata; " + "see https://github.com/pola-rs/polars/issues/5117" +) + + +def _is_polars_type(obj) -> bool: + """Return True if obj is a Polars DataFrame or Series.""" + return type(obj).__module__.startswith("polars") + # Table and column metadata functions @@ -28,6 +38,8 @@ def copy_metadata(source, destination) -> None: :param destination: the DataFrame or Series to copy metadata to :raise TypeError: if the types of source and destination do not match """ + if _is_polars_type(source) or _is_polars_type(destination): + raise TypeError(_POLARS_METADATA_ERROR) # Verify that types of source and destination match if 
isinstance(source, pd.DataFrame) and not isinstance(destination, pd.DataFrame): raise TypeError("both source and destination must be DataFrames") @@ -65,6 +77,8 @@ def get_spotfire_types(dataframe: pd.DataFrame) -> pd.Series: :param dataframe: the DataFrame to get the Spotfire types of :returns: a Series containing the Spotfire types of each column of dataframe """ + if _is_polars_type(dataframe): + raise TypeError(_POLARS_METADATA_ERROR) if not isinstance(dataframe, pd.DataFrame): raise TypeError("dataframe is not a DataFrame") spotfire_types = {} @@ -83,6 +97,8 @@ def set_spotfire_types(dataframe: pd.DataFrame, column_types: _ColumnTypes) -> N :param dataframe: the DataFrame to set the Spotfire types of :param column_types: dictionary that maps column names to column types """ + if _is_polars_type(dataframe): + raise TypeError(_POLARS_METADATA_ERROR) if not isinstance(dataframe, pd.DataFrame): raise TypeError("dataframe is not a DataFrame") for col, spotfire_type in column_types.items(): diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 28770f5..b381f4a 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -110,6 +110,13 @@ cdef object _timedelta_from_msec(long long msec): cdef object _DATETIME_EPOCH = datetime.datetime(1, 1, 1) cdef object _TIMEDELTA_ONE_MSEC = _timedelta_from_msec(1) +# Milliseconds between the SBDF epoch (datetime(1, 1, 1)) and the Unix epoch (datetime(1970, 1, 1)). +# = 719162 days * 86400 s/day * 1000 ms/s, derived from: +# (datetime.datetime(1970, 1, 1) - datetime.datetime(1, 1, 1)).total_seconds() * 1000 +# Used in the Polars import path to convert raw SBDF int64 ms values to Unix-based int64 ms values +# without boxing through Python datetime objects. +cdef long long _SBDF_TO_UNIX_EPOCH_MS = 62135596800000 + cdef extern from *: """ @@ -437,6 +444,28 @@ cdef class _ImportContext: """ return self.numpy_type_num == np_c.NPY_OBJECT + cpdef int get_value_type_id(self): + """Return the SBDF value type ID for this column. 
+ + :return: the integer SBDF value type ID + + .. note:: ``value_type`` is a ``cdef`` C struct attribute inaccessible from Python. This + ``cpdef`` wrapper lets :func:`_import_build_polars_dataframe` dispatch on type + without a Cython-level cast. + """ + return self.value_type.id + + cpdef void clear_values_arrays(self): + """Release the internal per-slice values arrays to allow early garbage collection. + + Call this after :meth:`get_values_array` has produced the concatenated result and the + caller no longer needs the per-slice data. Dropping these references makes the underlying + NPY_OBJECT (or NPY_INT64) slice arrays eligible for GC before the Polars Arrow buffer is + allocated, reducing peak memory from three live copies to two (or one, for types where + Polars can reference the numpy buffer directly). + """ + self.values_arrays = [] + # Individual functions for importing each value type. ctypedef int(*importer_fn)(_ImportContext, sbdf_c.sbdf_columnslice*) @@ -713,27 +742,67 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): :param importer_contexts: list of _ImportContext objects :return: a Polars DataFrame """ + warnings.warn( + "Polars DataFrames do not support Spotfire metadata; table and column metadata are not " + "preserved. See https://github.com/pola-rs/polars/issues/5117", + SBDFWarning + ) series_list = [] for i, name in enumerate(column_names): context = importer_contexts[i] - values = context.get_values_array() invalids = context.get_invalid_array() - polars_dtype = _import_polars_dtype(context) + vt_id = context.get_value_type_id() + + if vt_id == sbdf_c.SBDF_DATETIMETYPEID: + # Raw int64 ms since SBDF epoch → subtract fixed offset → reinterpret as + # datetime64[ms]. All arithmetic is in-place on the concatenated array, so + # peak memory is: one int64 numpy array + the Polars Arrow buffer (2 copies, + # or 1 if Polars references the numpy buffer directly). 
+ values = context.get_values_array() + context.clear_values_arrays() + values -= _SBDF_TO_UNIX_EPOCH_MS + col = pl.Series(name=name, values=values.view('datetime64[ms]'), dtype=pl.Datetime('ms')) + if invalids.any(): + col = col.scatter(np.where(invalids)[0].tolist(), None) + + elif vt_id == sbdf_c.SBDF_DATETYPEID: + # Same raw int64 ms path; divide down to days for pl.Date. + values = context.get_values_array() + context.clear_values_arrays() + values -= _SBDF_TO_UNIX_EPOCH_MS + values //= 86400000 + col = pl.Series(name=name, values=values.view('datetime64[D]'), dtype=pl.Date) + if invalids.any(): + col = col.scatter(np.where(invalids)[0].tolist(), None) + + elif vt_id == sbdf_c.SBDF_TIMESPANTYPEID: + # Timespans are already int64 ms with no epoch bias — reinterpret directly. + values = context.get_values_array() + context.clear_values_arrays() + col = pl.Series(name=name, values=values.view('timedelta64[ms]'), dtype=pl.Duration('ms')) + if invalids.any(): + col = col.scatter(np.where(invalids)[0].tolist(), None) + + elif not context.is_object_numpy_type(): + # Numeric types (bool, int, float): numpy → Polars directly; Polars may zero-copy + # the buffer. No early release needed — these arrays are small relative to the data. + values = context.get_values_array() + col = pl.Series(name=name, values=values, dtype=_import_polars_dtype(context)) + if invalids.any(): + col = col.scatter(np.where(invalids)[0].tolist(), None) - if context.is_object_numpy_type(): - # Object arrays hold Python objects (str, date, datetime, etc.); Polars cannot - # construct a typed series from a numpy object array directly — use a Python list. + else: + # String, time, binary, decimal: Polars requires a Python list (no compatible numpy + # dtype). Release the concatenated array before building the Arrow buffer to cap + # peak memory at 2 live copies (list + Arrow) instead of 3. 
+ values = context.get_values_array() values_list = values.tolist() + context.clear_values_arrays() + del values if invalids.any(): for idx in np.where(invalids)[0]: values_list[idx] = None - col = pl.Series(name=name, values=values_list, dtype=polars_dtype) - else: - # Numeric arrays: numpy → Polars Series directly, then scatter nulls if needed. - col = pl.Series(name=name, values=values, dtype=polars_dtype) - if invalids.any(): - indices = np.where(invalids)[0].tolist() - col = col.scatter(indices, None) + col = pl.Series(name=name, values=values_list, dtype=_import_polars_dtype(context)) series_list.append(col) @@ -814,14 +883,30 @@ def import_data(sbdf_file, output_format="pandas"): importer_contexts.append(_ImportContext(np_c.NPY_INT32, col_type)) importer_fns[i] = _import_vts_numpy elif col_type.id == sbdf_c.SBDF_DATETIMETYPEID: - importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) - importer_fns[i] = _import_vt_datetime + if output_format == "polars": + # Store raw int64 ms values; _import_build_polars_dataframe will adjust the + # epoch offset and reinterpret as datetime64[ms] without boxing Python objects. 
+ importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) + importer_fns[i] = _import_vts_numpy + else: + importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) + importer_fns[i] = _import_vt_datetime elif col_type.id == sbdf_c.SBDF_DATETYPEID: - importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) - importer_fns[i] = _import_vt_date + if output_format == "polars": + importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) + importer_fns[i] = _import_vts_numpy + else: + importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) + importer_fns[i] = _import_vt_date elif col_type.id == sbdf_c.SBDF_TIMESPANTYPEID: - importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) - importer_fns[i] = _import_vt_timespan + if output_format == "polars": + # Timespans are stored as int64 ms with no epoch — reinterpret directly as + # timedelta64[ms] in _import_build_polars_dataframe. + importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) + importer_fns[i] = _import_vts_numpy + else: + importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) + importer_fns[i] = _import_vt_timespan elif col_type.id == sbdf_c.SBDF_TIMETYPEID: importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_time @@ -1229,6 +1314,11 @@ cdef _export_obj_polars_dataframe(obj): :return: tuple containing dictionary of table metadata, list of column names, list of dictionaries of column metadata, and list of export context objects """ + warnings.warn( + "Polars DataFrames do not support Spotfire metadata; the exported SBDF will not contain " + "table or column metadata. 
See https://github.com/pola-rs/polars/issues/5117", + SBDFWarning + ) if len(set(obj.columns)) != len(obj.columns): raise SBDFError("obj does not have unique column names") @@ -1259,6 +1349,11 @@ cdef _export_obj_polars_series(obj, default_column_name): :return: tuple containing dict of table metadata, list of column names, list of dicts of column metadata, and list of export context objects """ + warnings.warn( + "Polars DataFrames do not support Spotfire metadata; the exported SBDF will not contain " + "table or column metadata. See https://github.com/pola-rs/polars/issues/5117", + SBDFWarning + ) column_name = obj.name if obj.name else default_column_name description = f"series '{obj.name}'" if obj.name else "series" diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index b048ac5..22c9f10 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -709,3 +709,46 @@ def test_write_polars_float_nan(self): self.assertAlmostEqual(result["vals"][0], 1.0) self.assertTrue(pd.isnull(result["vals"][1])) self.assertAlmostEqual(result["vals"][2], 3.0) + + # Metadata warning tests + + def test_polars_import_meta_warning(self): + """import_data with output_format='polars' should warn that metadata is not preserved.""" + with self.assertWarnsRegex(sbdf.SBDFWarning, "metadata"): + sbdf.import_data(utils.get_test_data_file("sbdf/1.sbdf"), output_format="polars") + + def test_polars_df_export_meta_warn(self): + """export_data with a Polars DataFrame should warn that metadata is not preserved.""" + polars_df = pl.DataFrame({"x": [1, 2, 3]}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/meta_warn.sbdf" + with self.assertWarnsRegex(sbdf.SBDFWarning, "metadata"): + sbdf.export_data(polars_df, path) + + def test_polars_series_meta_export(self): + """export_data with a Polars Series should warn that metadata is not preserved.""" + series = pl.Series("x", [1, 2, 3]) + with tempfile.TemporaryDirectory() as tempdir: + path = 
f"{tempdir}/meta_warn_series.sbdf" + with self.assertWarnsRegex(sbdf.SBDFWarning, "metadata"): + sbdf.export_data(series, path) + + # Metadata public-API error tests + + def test_copy_metadata_polars_error(self): + """copy_metadata should raise TypeError with a Polars-specific message.""" + polars_df = pl.DataFrame({"x": [1, 2, 3]}) + with self.assertRaisesRegex(TypeError, "Polars"): + spotfire.copy_metadata(polars_df, polars_df) + + def test_get_types_polars_error(self): + """get_spotfire_types should raise TypeError with a Polars-specific message.""" + polars_df = pl.DataFrame({"x": [1, 2, 3]}) + with self.assertRaisesRegex(TypeError, "Polars"): + spotfire.get_spotfire_types(polars_df) + + def test_set_types_polars_error(self): + """set_spotfire_types should raise TypeError with a Polars-specific message.""" + polars_df = pl.DataFrame({"x": [1, 2, 3]}) + with self.assertRaisesRegex(TypeError, "Polars"): + spotfire.set_spotfire_types(polars_df, {"x": "Integer"}) From 392d1818b32a662e2640864d65b66c4357e3fff5 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 12:14:04 -0500 Subject: [PATCH 14/38] Add arithmetic correctness test for Polars date epoch conversion Verifies that the in-place epoch-shift + .view('datetime64[D]') path in _import_build_polars_dataframe produces identical results to the reference np.astype('datetime64[D]') conversion across six dates: the SBDF epoch (0001-01-01), one day before and the day of the Unix epoch, one day after, a recent date, and the maximum representable date (9999-12-31). 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 22c9f10..99f7e6a 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -549,6 +549,7 @@ def _assert_is_png_image(self, expr: bytes) -> None: @unittest.skipIf(pl is None, "polars not installed") class SbdfPolarsTest(unittest.TestCase): """Unit tests for Polars DataFrame support in 'spotfire.sbdf' module.""" + # pylint: disable=too-many-public-methods def test_write_polars_basic(self): """Exporting a Polars DataFrame with common types should produce a valid SBDF file.""" @@ -710,6 +711,42 @@ def test_write_polars_float_nan(self): self.assertTrue(pd.isnull(result["vals"][1])) self.assertAlmostEqual(result["vals"][2], 3.0) + # Date conversion correctness test + + def test_date_view_equals_astype(self): + """The in-place epoch-shift + view conversion used in _import_build_polars_dataframe + should produce the same datetime64[D] values as the reference astype() path for a + range of dates spanning the SBDF epoch, dates before the Unix epoch, the Unix epoch + itself, a recent date, and the maximum representable date.""" + sbdf_epoch_ms = 62135596800000 # ms from datetime(1,1,1) to datetime(1970,1,1) + test_dates = [ + datetime.date(1, 1, 1), # SBDF epoch — largest negative offset from Unix + datetime.date(1969, 12, 31), # one day before Unix epoch + datetime.date(1970, 1, 1), # Unix epoch — must give day 0 + datetime.date(1970, 1, 2), # one day after Unix epoch + datetime.date(2024, 1, 15), # arbitrary recent date + datetime.date(9999, 12, 31), # maximum Python date + ] + for test_date in test_dates: + # Reproduce the raw SBDF int64 value exactly as the C importer would produce it. 
+ sbdf_ms = int( + (test_date - datetime.date(1, 1, 1)) / datetime.timedelta(milliseconds=1) + ) + arr = np.array([sbdf_ms], dtype=np.int64) + + # Apply the same in-place conversion used in _import_build_polars_dataframe. + arr -= sbdf_epoch_ms + arr //= 86400000 + view_result = arr.view('datetime64[D]')[0] + + # Reference: convert the Python date directly via astype. + ref_result = np.array([test_date], dtype=object).astype('datetime64[D]')[0] + + self.assertEqual( + view_result, ref_result, + msg=f"Mismatch for {test_date}: view={view_result}, astype={ref_result}" + ) + # Metadata warning tests def test_polars_import_meta_warning(self): From 93f0b0b4f32ecd12601424554c9b4cd404ecc5dc Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 12:25:23 -0500 Subject: [PATCH 15/38] Fix Polars temporal import to be genuinely zero-copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous view('datetime64[ms]') approach always triggered a copy inside Polars: _normalise_numpy_dtype() unconditionally calls .astype(np.int64) on any datetime64 input before passing to the Rust constructor. Verified via mutation test (numpy array modified after Series construction): - Datetime: pl.Series(int64, Int64).cast(Datetime('ms')) — zero-copy; Int64 and Datetime('ms') share the same int64 Arrow buffer (metadata-only cast). - Duration: pl.Series(int64, Int64).cast(Duration('ms')) — same, zero-copy. - Date: pl.Date is int32 internally, so int64→int32 narrowing is unavoidable (1 copy via .astype(np.int32)); pl.Series(int32, Date) is then zero-copy. Total: 2 copies from C data (down from 3 in the original NPY_OBJECT path). 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index b381f4a..16cb407 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -754,32 +754,36 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): vt_id = context.get_value_type_id() if vt_id == sbdf_c.SBDF_DATETIMETYPEID: - # Raw int64 ms since SBDF epoch → subtract fixed offset → reinterpret as - # datetime64[ms]. All arithmetic is in-place on the concatenated array, so - # peak memory is: one int64 numpy array + the Polars Arrow buffer (2 copies, - # or 1 if Polars references the numpy buffer directly). + # Raw int64 ms since SBDF epoch → subtract fixed offset → Int64 Series → + # cast to Datetime('ms'). Polars' cast between Int64 and Datetime('ms') is a + # zero-copy metadata operation (both are int64 internally in Arrow), so the + # Series shares the same buffer as the numpy array: 1 copy total from C data. values = context.get_values_array() context.clear_values_arrays() values -= _SBDF_TO_UNIX_EPOCH_MS - col = pl.Series(name=name, values=values.view('datetime64[ms]'), dtype=pl.Datetime('ms')) + col = pl.Series(name=name, values=values, dtype=pl.Int64).cast(pl.Datetime('ms')) if invalids.any(): col = col.scatter(np.where(invalids)[0].tolist(), None) elif vt_id == sbdf_c.SBDF_DATETYPEID: - # Same raw int64 ms path; divide down to days for pl.Date. + # Same raw int64 ms path; divide down to days, then narrow to int32. + # pl.Date is stored as int32 days since Unix epoch in Arrow, so the int64→int32 + # narrowing is unavoidable (1 copy). pl.Series(int32, pl.Date) is then + # zero-copy: 2 copies total from C data. 
values = context.get_values_array() context.clear_values_arrays() values -= _SBDF_TO_UNIX_EPOCH_MS values //= 86400000 - col = pl.Series(name=name, values=values.view('datetime64[D]'), dtype=pl.Date) + col = pl.Series(name=name, values=values.astype(np.int32), dtype=pl.Date) if invalids.any(): col = col.scatter(np.where(invalids)[0].tolist(), None) elif vt_id == sbdf_c.SBDF_TIMESPANTYPEID: - # Timespans are already int64 ms with no epoch bias — reinterpret directly. + # Timespans are int64 ms with no epoch bias. Duration('ms') is int64 in Arrow, + # so the cast is zero-copy: 1 copy total from C data. values = context.get_values_array() context.clear_values_arrays() - col = pl.Series(name=name, values=values.view('timedelta64[ms]'), dtype=pl.Duration('ms')) + col = pl.Series(name=name, values=values, dtype=pl.Int64).cast(pl.Duration('ms')) if invalids.any(): col = col.scatter(np.where(invalids)[0].tolist(), None) From a20782b8dd728bc49bf748d1e56be33c285c697f Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 12:39:23 -0500 Subject: [PATCH 16/38] =?UTF-8?q?Perf:=20convert=20Date=20ms=E2=86=92days?= =?UTF-8?q?=20as=20int32=20at=20C=20level=20(1=20copy=20instead=20of=202)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _import_vt_date_int32 writes directly into an NPY_INT32 slice array at the C level, so pl.Series(int32, pl.Date) in _import_build_polars_dataframe is then zero-copy — eliminating the prior int64→int32 astype() narrowing copy. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 16cb407..bdf52e7 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -517,6 +517,31 @@ cdef int _import_vt_date(_ImportContext context, sbdf_c.sbdf_columnslice* col_sl return error +cdef int _import_vt_date_int32(_ImportContext context, sbdf_c.sbdf_columnslice* col_slice): + """Import a date column slice as int32 days since Unix epoch (Polars path only). + + Converts the raw SBDF int64 millisecond values to int32 days at the C level, writing + directly into an NPY_INT32 slice. This avoids an intermediate int64 array and the + subsequent astype(np.int32) copy, reducing total allocations from C data to one. + + SBDF dates are always stored at midnight (exact multiples of 86400000 ms), so C + integer division equals Python floor division for both positive and negative offsets. + """ + cdef int error + (error, values, invalid) = context.get_values_and_invalid(col_slice) + cdef long long* data + cdef int i + if error == sbdf_c.SBDF_OK: + values_slice = context.new_slice_from_empty(values.count) + data = values.data + for i in range(values.count): + values_slice[i] = ((data[i] - _SBDF_TO_UNIX_EPOCH_MS) / 86400000) + invalid_slice = context.new_slice_from_invalid(values.count, invalid) + context.append_values_slice(values_slice, invalid_slice) + context.cleanup_values_and_invalid(values, invalid) + return error + + cdef int _import_vt_time(_ImportContext context, sbdf_c.sbdf_columnslice* col_slice): """Import a column slice consisting of time values.""" cdef int error @@ -766,15 +791,11 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): col = col.scatter(np.where(invalids)[0].tolist(), None) elif vt_id == sbdf_c.SBDF_DATETYPEID: - # Same raw int64 ms path; divide down to days, then narrow to int32. 
- # pl.Date is stored as int32 days since Unix epoch in Arrow, so the int64→int32 - # narrowing is unavoidable (1 copy). pl.Series(int32, pl.Date) is then - # zero-copy: 2 copies total from C data. + # _import_vt_date_int32 already converted ms→days and wrote int32 directly. + # pl.Series(int32, pl.Date) is zero-copy: 1 copy total from C data. values = context.get_values_array() context.clear_values_arrays() - values -= _SBDF_TO_UNIX_EPOCH_MS - values //= 86400000 - col = pl.Series(name=name, values=values.astype(np.int32), dtype=pl.Date) + col = pl.Series(name=name, values=values, dtype=pl.Date) if invalids.any(): col = col.scatter(np.where(invalids)[0].tolist(), None) @@ -897,8 +918,8 @@ def import_data(sbdf_file, output_format="pandas"): importer_fns[i] = _import_vt_datetime elif col_type.id == sbdf_c.SBDF_DATETYPEID: if output_format == "polars": - importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) - importer_fns[i] = _import_vts_numpy + importer_contexts.append(_ImportContext(np_c.NPY_INT32, col_type)) + importer_fns[i] = _import_vt_date_int32 else: importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_date From 5a90fe7d14db6fcb03fc59b220a7770cc9bfb462 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 13:07:09 -0500 Subject: [PATCH 17/38] Perf: zero-copy Polars export for temporal/numeric types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For null-free numeric columns, skip fill_null and use to_numpy(allow_copy=False) to return a direct view of the Arrow buffer. 
For Datetime/Date/Duration/Time, extract raw integer buffers from the Polars Series (zero-copy when null-free) and route through four new Polars-specific C-level exporter functions that perform epoch/unit conversion in a tight C loop, completely bypassing the Python-object-boxing loop in the generic exporters: - _export_vt_polars_datetime: int64 ms (Unix) → add SBDF epoch offset - _export_vt_polars_date: int32 days → int64 ms (SBDF epoch) - _export_vt_polars_timespan: int64 ms passthrough (no epoch needed) - _export_vt_polars_time: int64 ns → int64 ms Columns with nulls fall back to a fill-zero copy (Arrow's validity bitmap cannot be expressed inline in a numpy int array), but are still processed by the C loop. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 192 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 168 insertions(+), 24 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index bdf52e7..142c7fb 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1042,6 +1042,7 @@ cdef class _ExportContext: cdef np_c.ndarray values_array cdef np_c.ndarray invalid_array cdef bint any_invalid + cdef int polars_exporter_id # 0=default; 1=datetime; 2=date; 3=timespan; 4=time def __init__(self): """Initialize the export context.""" @@ -1049,6 +1050,7 @@ cdef class _ExportContext: self.values_array = None self.invalid_array = None self.any_invalid = False + self.polars_exporter_id = 0 cdef void set_arrays(self, np_c.ndarray values, invalid): """Set the NumPy ``ndarray`` with the values to export and a list or NumPy ``ndarray`` of whether each value @@ -1081,6 +1083,13 @@ cdef class _ExportContext: """ return self.valuetype_id + cpdef int get_polars_exporter_id(self): + """Get the Polars-specific exporter ID (0 = use default exporter). + + :return: 0 default; 1 datetime; 2 date; 3 timespan; 4 time + """ + return self.polars_exporter_id + def get_numpy_dtype(self): """Get the correct NumPy dtype for this column. 
@@ -1299,11 +1308,16 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): raise SBDFError(f"unknown Polars dtype '{dtype_name}' in {series_description}") -cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series): - """Convert a Polars Series to a NumPy array suitable for the SBDF exporter. +cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series, + np_c.ndarray invalids): + """Convert a non-temporal Polars Series to a NumPy array for the SBDF exporter. + + Temporal types (Datetime, Date, Duration, Time) are handled by + ``_export_polars_setup_arrays`` before this function is reached. :param context: export context holding the resolved value type - :param series: Polars Series to convert + :param series: Polars Series to convert (non-temporal) + :param invalids: boolean NumPy array marking which rows are null/NaN :return: NumPy ndarray of values """ dtype_name = series.dtype.__class__.__name__ @@ -1316,18 +1330,22 @@ cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series) # Cast to String so .to_numpy() returns plain Python strings series = series.cast(pl.Utf8) dtype_name = "Utf8" - if dtype_name in ("Date", "Time"): - # The Date/Time exporters require Python date/time objects; - # Polars .to_numpy() returns numpy datetime64/int64 which those exporters do not accept. - return np.asarray(series.to_list(), dtype=object) - if dtype_name in ("Datetime", "Duration"): - # Keep native datetime64/timedelta64 arrays; the invalids mask handles nulls (NaT cells - # are marked invalid and ignored by the SBDF writer). Boxing to object would be slower. - return series.to_numpy(allow_copy=True) na_value = context.get_numpy_na_value() if na_value is not None: - return np.asarray(series.fill_null(na_value).to_numpy(allow_copy=True), - dtype=context.get_numpy_dtype()) + # Numeric / boolean column. 
Skip fill_null when the series is null-free: + # to_numpy(allow_copy=False) returns a zero-copy view of the Arrow buffer. + # Fall back to fill_null+copy when nulls are present (Arrow's validity bitmap + # cannot be expressed inline in a numpy array for integer/boolean dtypes). + if invalids.any(): + return np.asarray(series.fill_null(na_value).to_numpy(allow_copy=True), + dtype=context.get_numpy_dtype()) + else: + try: + return np.asarray(series.to_numpy(allow_copy=False), + dtype=context.get_numpy_dtype()) + except Exception: + return np.asarray(series.to_numpy(allow_copy=True), + dtype=context.get_numpy_dtype()) else: return np.asarray(series.to_numpy(allow_copy=True), dtype=object) @@ -1355,11 +1373,7 @@ cdef _export_obj_polars_dataframe(obj): column_names.append(col) context = _ExportContext() context.set_valuetype_id(_export_infer_valuetype_from_polars_dtype(series.dtype, f"column '{col}'")) - if series.dtype.__class__.__name__ in ("Float32", "Float64"): - invalids = (series.is_null() | series.is_nan()).to_numpy() - else: - invalids = series.is_null().to_numpy() - context.set_arrays(_export_polars_series_to_numpy(context, series), invalids) + _export_polars_setup_arrays(context, series) column_metadata.append({}) exporter_contexts.append(context) @@ -1384,11 +1398,7 @@ cdef _export_obj_polars_series(obj, default_column_name): context = _ExportContext() context.set_valuetype_id(_export_infer_valuetype_from_polars_dtype(obj.dtype, description)) - if obj.dtype.__class__.__name__ in ("Float32", "Float64"): - invalids = (obj.is_null() | obj.is_nan()).to_numpy() - else: - invalids = obj.is_null().to_numpy() - context.set_arrays(_export_polars_series_to_numpy(context, obj), invalids) + _export_polars_setup_arrays(context, obj) return {}, [column_name], [{}], [context] @@ -1602,6 +1612,130 @@ cdef exporter_fn _export_get_exporter(int valuetype_id): return _export_vt_decimal +cdef np_c.ndarray _polars_temporal_to_numpy(series): + """Return a raw-integer NumPy array 
from a Polars integer Series, zero-copy when possible. + + ``series`` must already be cast to the target integer type (Int32 or Int64). + Zero-copy succeeds for null-free series; falls back to a fill-zero copy when nulls + are present (Polars cannot expose the Arrow validity bitmap inline in a numpy view + for integer types). The zeroed values at null positions are never read by Spotfire + because the SBDF invalids array marks those rows as missing. + """ + try: + return series.to_numpy(allow_copy=False) + except Exception: + return series.to_numpy(allow_copy=True) + + +cdef void _export_polars_setup_arrays(_ExportContext context, series): + """Populate context arrays and polars_exporter_id for a Polars Series. + + For temporal types, extracts raw integer buffers (zero-copy when the series has no + nulls) and selects a dedicated C-level exporter that performs the epoch / unit + conversion without boxing Python objects. For all other types, delegates to + ``_export_polars_series_to_numpy``. + """ + dtype_name = series.dtype.__class__.__name__ + if dtype_name in ("Float32", "Float64"): + invalids = (series.is_null() | series.is_nan()).to_numpy() + else: + invalids = series.is_null().to_numpy() + + if dtype_name == "Datetime": + # Normalise to ms precision for SBDF; cast Datetime('ms')→Int64 is zero-copy. + if getattr(series.dtype, 'time_unit', 'ms') != 'ms': + raw = series.cast(pl.Datetime('ms')).cast(pl.Int64) + else: + raw = series.cast(pl.Int64) + context.set_arrays(_polars_temporal_to_numpy(raw), invalids) + context.polars_exporter_id = 1 + elif dtype_name == "Duration": + if getattr(series.dtype, 'time_unit', 'ms') != 'ms': + raw = series.cast(pl.Duration('ms')).cast(pl.Int64) + else: + raw = series.cast(pl.Int64) + context.set_arrays(_polars_temporal_to_numpy(raw), invalids) + context.polars_exporter_id = 3 + elif dtype_name == "Date": + # Date is always int32 days since Unix epoch in Arrow. 
+ context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int32)), invalids) + context.polars_exporter_id = 2 + elif dtype_name == "Time": + # Time is always int64 ns since midnight in Arrow. + context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int64)), invalids) + context.polars_exporter_id = 4 + else: + context.set_arrays(_export_polars_series_to_numpy(context, series, invalids), invalids) + + +cdef int _export_vt_polars_datetime(_ExportContext context, Py_ssize_t start, Py_ssize_t count, + sbdf_c.sbdf_object** obj): + """Export a Polars Datetime column. + + ``values_array`` holds int64 ms since the Unix epoch. Adds the fixed SBDF-to-Unix + offset in a tight C loop across all positions; null positions are zeroed in the + input by Polars and are ignored by Spotfire via the SBDF invalids array. + """ + cdef np_c.npy_intp shape[1] + shape[0] = count + cdef np_c.ndarray out = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef long long* src = np_c.PyArray_DATA(context.values_array) + cdef long long* dst = np_c.PyArray_DATA(out) + cdef Py_ssize_t i + for i in range(count): + dst[i] = src[start + i] + _SBDF_TO_UNIX_EPOCH_MS + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_datetime(), count, np_c.PyArray_DATA(out), NULL, obj) + + +cdef int _export_vt_polars_date(_ExportContext context, Py_ssize_t start, Py_ssize_t count, + sbdf_c.sbdf_object** obj): + """Export a Polars Date column. + + ``values_array`` holds int32 days since the Unix epoch. Converts each value to + int64 ms since the SBDF epoch in a tight C loop. 
+ """ + cdef np_c.npy_intp shape[1] + shape[0] = count + cdef np_c.ndarray out = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef int* src = np_c.PyArray_DATA(context.values_array) + cdef long long* dst = np_c.PyArray_DATA(out) + cdef Py_ssize_t i + for i in range(count): + dst[i] = (src[start + i]) * 86400000 + _SBDF_TO_UNIX_EPOCH_MS + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_date(), count, np_c.PyArray_DATA(out), NULL, obj) + + +cdef int _export_vt_polars_timespan(_ExportContext context, Py_ssize_t start, Py_ssize_t count, + sbdf_c.sbdf_object** obj): + """Export a Polars Duration column. + + ``values_array`` holds int64 ms. SBDF TimeSpan is also int64 ms with no epoch + bias, so the Arrow buffer can be sliced and passed directly to the C writer without + any per-element loop. + """ + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_timespan(), count, + _export_get_offset_ptr(context.values_array, start, count), + NULL, obj) + + +cdef int _export_vt_polars_time(_ExportContext context, Py_ssize_t start, Py_ssize_t count, + sbdf_c.sbdf_object** obj): + """Export a Polars Time column. + + ``values_array`` holds int64 ns since midnight (Polars / Arrow internal format). + Converts to int64 ms for SBDF in a tight C loop. 
+ """ + cdef np_c.npy_intp shape[1] + shape[0] = count + cdef np_c.ndarray out = np_c.PyArray_ZEROS(1, shape, np_c.NPY_INT64, 0) + cdef long long* src = np_c.PyArray_DATA(context.values_array) + cdef long long* dst = np_c.PyArray_DATA(out) + cdef Py_ssize_t i + for i in range(count): + dst[i] = src[start + i] // 1000000 + return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_time(), count, np_c.PyArray_DATA(out), NULL, obj) + + cdef int _export_vt_bool(_ExportContext context, Py_ssize_t start, Py_ssize_t count, sbdf_c.sbdf_object** obj): """Export a slice of data consisting of boolean values.""" cdef np_c.ndarray values @@ -2261,7 +2395,17 @@ def export_data(obj, sbdf_file, default_column_name="x", Py_ssize_t rows_per_sli for i in range(num_columns): values = NULL context = exporter_contexts[i] - exporter = _export_get_exporter(context.get_valuetype_id()) + pol_id = context.get_polars_exporter_id() + if pol_id == 1: + exporter = _export_vt_polars_datetime + elif pol_id == 2: + exporter = _export_vt_polars_date + elif pol_id == 3: + exporter = _export_vt_polars_timespan + elif pol_id == 4: + exporter = _export_vt_polars_time + else: + exporter = _export_get_exporter(context.get_valuetype_id()) error = exporter(context, row_offset, rows_per_slice, &values) if error != sbdf_c.SBDF_OK: raise SBDFError(f"error exporting column '{column_names[i]}': " From e285cc3a98eab5d3b9371ce1c29b77114d8ef93e Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 13:14:16 -0500 Subject: [PATCH 18/38] Fix: remove illegal implementation signature from sbdf.pyi stub Stub files must not contain a concrete @overload implementation alongside the overload variants; mypy rejects it with 'An implementation for an overloaded function is not allowed in a stub file'. Remove the offending line, leaving only the two typed overloads. Also suppress call-overload at the one test site that intentionally passes an invalid output_format value to exercise the SBDFError path. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyi | 1 - spotfire/test/test_sbdf.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/spotfire/sbdf.pyi b/spotfire/sbdf.pyi index 9bd2812..55aeafc 100644 --- a/spotfire/sbdf.pyi +++ b/spotfire/sbdf.pyi @@ -17,6 +17,5 @@ def spotfire_typename_to_valuetype_id(typename: str) -> typing.Optional[int]: .. def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["pandas"] = ...) -> pd.DataFrame: ... @typing.overload def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["polars"]) -> typing.Any: ... -def import_data(sbdf_file: _FilenameLike, output_format: str = "pandas") -> typing.Any: ... def export_data(obj: typing.Any, sbdf_file: _FilenameLike, default_column_name: str = "x", rows_per_slice: int = 0, encoding_rle: bool = True) -> None: ... diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 99f7e6a..3174680 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -655,7 +655,7 @@ def test_invalid_output_format(self): path = f"{tempdir}/output.sbdf" sbdf.export_data(polars_df, path) with self.assertRaises(sbdf.SBDFError): - sbdf.import_data(path, output_format="numpy") + sbdf.import_data(path, output_format="numpy") # type: ignore[call-overload] def test_write_polars_empty(self): """Exporting an empty Polars DataFrame should produce a valid (empty) SBDF file.""" From 71fd4c373b244ec401fb23e41622bc9f5c730ab5 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 13:46:38 -0500 Subject: [PATCH 19/38] Fix: zero null positions before pl.Series(pl.Time) construction on import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SBDF null slots may contain sentinel values (e.g. INT64_MAX) which, after the ms→ns ×1_000_000 scale in _import_vt_time_int64, exceed Polars' valid Time range [0, 86_400_000_000_000 ns]. 
Zero them out before passing the int64 buffer to pl.Series(dtype=pl.Time); the invalids array then overwrites those slots with None. Also adds OutputFormat enum, cython-lint-friendly named export constants, and fixes the sbdf.pyi stub to use TYPE_CHECKING guard. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/__init__.py | 1 + spotfire/sbdf.pyi | 11 +++++- spotfire/sbdf.pyx | 90 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 89 insertions(+), 13 deletions(-) diff --git a/spotfire/__init__.py b/spotfire/__init__.py index 6934d82..4d9e161 100644 --- a/spotfire/__init__.py +++ b/spotfire/__init__.py @@ -5,3 +5,4 @@ """User visible utility functions.""" from spotfire.public import copy_metadata, get_spotfire_types, set_spotfire_types, set_geocoding_table +from spotfire.sbdf import OutputFormat diff --git a/spotfire/sbdf.pyi b/spotfire/sbdf.pyi index 55aeafc..853e9de 100644 --- a/spotfire/sbdf.pyi +++ b/spotfire/sbdf.pyi @@ -2,20 +2,29 @@ # This file is subject to the license terms contained # in the license file that is distributed with this file. +import enum import typing import pandas as pd +if typing.TYPE_CHECKING: + import polars as pl + _FilenameLike = typing.Union[str, bytes, int] class SBDFError(Exception): ... class SBDFWarning(Warning): ... +class OutputFormat(str, enum.Enum): + """Supported output formats for :func:`import_data`.""" + PANDAS: str + POLARS: str + def spotfire_typename_to_valuetype_id(typename: str) -> typing.Optional[int]: ... @typing.overload def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["pandas"] = ...) -> pd.DataFrame: ... @typing.overload -def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["polars"]) -> typing.Any: ... +def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["polars"]) -> "pl.DataFrame": ... def export_data(obj: typing.Any, sbdf_file: _FilenameLike, default_column_name: str = "x", rows_per_slice: int = 0, encoding_rle: bool = True) -> None: ... 
diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 142c7fb..c37f4da 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -83,6 +83,17 @@ class SBDFWarning(Warning): """A warning that is raised to indicate an issue during import or export of SBDF files.""" +import enum + +class OutputFormat(str, enum.Enum): + """Supported output formats for :func:`import_data`. + + Using this enum is preferred over passing raw strings, though both are accepted. + """ + PANDAS = "pandas" + POLARS = "polars" + + # Utility functions and definitions for managing data types cdef extern from *: """ @@ -542,6 +553,28 @@ cdef int _import_vt_date_int32(_ImportContext context, sbdf_c.sbdf_columnslice* return error +cdef int _import_vt_time_int64(_ImportContext context, sbdf_c.sbdf_columnslice* col_slice): + """Import a time column slice as int64 ns since midnight (Polars path only). + + SBDF Time values are stored as int64 milliseconds since midnight. Polars Time is + stored as int64 nanoseconds since midnight internally in Arrow, so each value is + multiplied by 1,000,000. pl.Series(int64, pl.Time) then wraps the buffer zero-copy. 
+ """ + cdef int error + (error, values, invalid) = context.get_values_and_invalid(col_slice) + cdef long long* data + cdef Py_ssize_t i + if error == sbdf_c.SBDF_OK: + values_slice = context.new_slice_from_empty(values.count) + data = values.data + for i in range(values.count): + values_slice[i] = data[i] * 1000000 + invalid_slice = context.new_slice_from_invalid(values.count, invalid) + context.append_values_slice(values_slice, invalid_slice) + context.cleanup_values_and_invalid(values, invalid) + return error + + cdef int _import_vt_time(_ImportContext context, sbdf_c.sbdf_columnslice* col_slice): """Import a column slice consisting of time values.""" cdef int error @@ -808,6 +841,21 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): if invalids.any(): col = col.scatter(np.where(invalids)[0].tolist(), None) + elif vt_id == sbdf_c.SBDF_TIMETYPEID: + # _import_vt_time_int64 stores int64 ns since midnight (Polars Time internal format). + # pl.Series(int64, pl.Time) validates every element, including null positions. + # SBDF null slots may contain sentinel values (e.g. INT64_MAX) which, after the + # ×1_000_000 ms→ns scale, exceed the valid Time range [0, 86_400_000_000_000 ns]. + # Zero them out before constructing the Series so validation passes; the invalids + # array then overwrites those slots with None immediately after. + values = context.get_values_array() + context.clear_values_arrays() + if invalids.any(): + values[invalids] = 0 + col = pl.Series(name=name, values=values, dtype=pl.Time) + if invalids.any(): + col = col.scatter(np.where(invalids)[0].tolist(), None) + elif not context.is_object_numpy_type(): # Numeric types (bool, int, float): numpy → Polars directly; Polars may zero-copy # the buffer. No early release needed — these arrays are small relative to the data. 
@@ -933,8 +981,12 @@ def import_data(sbdf_file, output_format="pandas"): importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_timespan elif col_type.id == sbdf_c.SBDF_TIMETYPEID: - importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) - importer_fns[i] = _import_vt_time + if output_format == "polars": + importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) + importer_fns[i] = _import_vt_time_int64 + else: + importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) + importer_fns[i] = _import_vt_time elif col_type.id == sbdf_c.SBDF_STRINGTYPEID: importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_string @@ -1034,6 +1086,14 @@ def import_data(sbdf_file, output_format="pandas"): mem.PyMem_RawFree(importer_fns) +# Polars-specific exporter IDs stored in _ExportContext.polars_exporter_id. +# Using C-level constants avoids Python object lookup in the hot export loop. +cdef int _POL_EXP_DEFAULT = 0 +cdef int _POL_EXP_DATETIME = 1 +cdef int _POL_EXP_DATE = 2 +cdef int _POL_EXP_TIMESPAN = 3 +cdef int _POL_EXP_TIME = 4 + # Export data to SBDF from Python. @cython.auto_pickle(False) cdef class _ExportContext: @@ -1343,7 +1403,10 @@ cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series, try: return np.asarray(series.to_numpy(allow_copy=False), dtype=context.get_numpy_dtype()) - except Exception: + except (pl.exceptions.InvalidOperationError, RuntimeError): + # Polars raises InvalidOperationError (older versions) or RuntimeError (1.x+) when + # allow_copy=False cannot be honoured (e.g., series contains nulls). Both are caught + # so the fallback copy path works across Polars versions. 
return np.asarray(series.to_numpy(allow_copy=True), dtype=context.get_numpy_dtype()) else: @@ -1623,7 +1686,10 @@ cdef np_c.ndarray _polars_temporal_to_numpy(series): """ try: return series.to_numpy(allow_copy=False) - except Exception: + except (pl.exceptions.InvalidOperationError, RuntimeError): + # Polars raises InvalidOperationError (older versions) or RuntimeError (1.x+) when + # allow_copy=False cannot be honoured (e.g., series contains nulls). Both are caught + # so the fallback copy path works across Polars versions. return series.to_numpy(allow_copy=True) @@ -1648,22 +1714,22 @@ cdef void _export_polars_setup_arrays(_ExportContext context, series): else: raw = series.cast(pl.Int64) context.set_arrays(_polars_temporal_to_numpy(raw), invalids) - context.polars_exporter_id = 1 + context.polars_exporter_id = _POL_EXP_DATETIME elif dtype_name == "Duration": if getattr(series.dtype, 'time_unit', 'ms') != 'ms': raw = series.cast(pl.Duration('ms')).cast(pl.Int64) else: raw = series.cast(pl.Int64) context.set_arrays(_polars_temporal_to_numpy(raw), invalids) - context.polars_exporter_id = 3 + context.polars_exporter_id = _POL_EXP_TIMESPAN elif dtype_name == "Date": # Date is always int32 days since Unix epoch in Arrow. context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int32)), invalids) - context.polars_exporter_id = 2 + context.polars_exporter_id = _POL_EXP_DATE elif dtype_name == "Time": # Time is always int64 ns since midnight in Arrow. 
context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int64)), invalids) - context.polars_exporter_id = 4 + context.polars_exporter_id = _POL_EXP_TIME else: context.set_arrays(_export_polars_series_to_numpy(context, series, invalids), invalids) @@ -2396,13 +2462,13 @@ def export_data(obj, sbdf_file, default_column_name="x", Py_ssize_t rows_per_sli values = NULL context = exporter_contexts[i] pol_id = context.get_polars_exporter_id() - if pol_id == 1: + if pol_id == _POL_EXP_DATETIME: exporter = _export_vt_polars_datetime - elif pol_id == 2: + elif pol_id == _POL_EXP_DATE: exporter = _export_vt_polars_date - elif pol_id == 3: + elif pol_id == _POL_EXP_TIMESPAN: exporter = _export_vt_polars_timespan - elif pol_id == 4: + elif pol_id == _POL_EXP_TIME: exporter = _export_vt_polars_time else: exporter = _export_get_exporter(context.get_valuetype_id()) From 04144571c7e659786603a6263bae57ddd249cf71 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 13:56:35 -0500 Subject: [PATCH 20/38] Fix: add polars to mypy ignore_missing_imports overrides polars is an optional dependency not installed in the CI lint environment; the TYPE_CHECKING guard in sbdf.pyi is sufficient for runtime, but mypy still needs the override to suppress import-not-found on the stub. 
Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 4588961..bdc605c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -287,5 +287,6 @@ plugins = ["numpy.typing.mypy_plugin"] module = [ "geopandas", "HtmlTestRunner", + "polars", ] ignore_missing_imports = true From 4bc86391a19ec4a0176359cf1ff03193dfbc7906 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 13:57:21 -0500 Subject: [PATCH 21/38] Docs: update README to show OutputFormat enum for import_data Co-Authored-By: Claude Sonnet 4.6 --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 62dab02..86ca0ff 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,8 @@ simply `spotfire`) to include the required Python packages to support optional f | `spotfire[dev,lint]` | Internal development | Once installed, `export_data()` accepts `polars.DataFrame` and `polars.Series` directly, and -`import_data()` can return a `polars.DataFrame` via `output_format="polars"`. +`import_data()` can return a `polars.DataFrame` by passing `output_format=spotfire.OutputFormat.POLARS` +(or the equivalent string `"polars"` for backwards compatibility). > **Note for Spotfire data functions:** Spotfire's bundled Python interpreter does not include > Polars. 
To use Polars inside a data function, configure Spotfire to use a custom Python From e5893e798be3eb6ac54120bc4ac7d9ae36a7d8c7 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 13:58:46 -0500 Subject: [PATCH 22/38] Docs: add concrete import_data example with OutputFormat enum Co-Authored-By: Claude Sonnet 4.6 --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 86ca0ff..49073b2 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,15 @@ simply `spotfire`) to include the required Python packages to support optional f | `spotfire[dev,lint]` | Internal development | Once installed, `export_data()` accepts `polars.DataFrame` and `polars.Series` directly, and -`import_data()` can return a `polars.DataFrame` by passing `output_format=spotfire.OutputFormat.POLARS` -(or the equivalent string `"polars"` for backwards compatibility). +`import_data()` can return a `polars.DataFrame`: + +```python +import spotfire.sbdf as sbdf + +df = sbdf.import_data("data.sbdf", output_format=sbdf.OutputFormat.POLARS) +``` + +The string `"polars"` is accepted as well for backwards compatibility. > **Note for Spotfire data functions:** Spotfire's bundled Python interpreter does not include > Polars. To use Polars inside a data function, configure Spotfire to use a custom Python From 39e01e29c8faea707518fbb2a58267bb92e4e3b9 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 14:02:49 -0500 Subject: [PATCH 23/38] Remove string-literal fallback from import_data output_format OutputFormat is no longer a str subclass; passing a raw string now raises SBDFError. Updated all call sites in tests and README to use OutputFormat.POLARS / OutputFormat.PANDAS, and tightened the .pyi overloads to Literal[OutputFormat.*] accordingly. 
Co-Authored-By: Claude Sonnet 4.6 --- README.md | 2 -- spotfire/sbdf.pyi | 6 +++--- spotfire/sbdf.pyx | 25 +++++++++++-------------- spotfire/test/test_sbdf.py | 10 +++++----- 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 49073b2..b915bf1 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,6 @@ import spotfire.sbdf as sbdf df = sbdf.import_data("data.sbdf", output_format=sbdf.OutputFormat.POLARS) ``` -The string `"polars"` is accepted as well for backwards compatibility. - > **Note for Spotfire data functions:** Spotfire's bundled Python interpreter does not include > Polars. To use Polars inside a data function, configure Spotfire to use a custom Python > environment that has `polars` installed. Polars is a large binary package (~44 MB), so diff --git a/spotfire/sbdf.pyi b/spotfire/sbdf.pyi index 853e9de..57ebf84 100644 --- a/spotfire/sbdf.pyi +++ b/spotfire/sbdf.pyi @@ -16,15 +16,15 @@ _FilenameLike = typing.Union[str, bytes, int] class SBDFError(Exception): ... class SBDFWarning(Warning): ... -class OutputFormat(str, enum.Enum): +class OutputFormat(enum.Enum): """Supported output formats for :func:`import_data`.""" PANDAS: str POLARS: str def spotfire_typename_to_valuetype_id(typename: str) -> typing.Optional[int]: ... @typing.overload -def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["pandas"] = ...) -> pd.DataFrame: ... +def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal[OutputFormat.PANDAS] = ...) -> pd.DataFrame: ... @typing.overload -def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["polars"]) -> "pl.DataFrame": ... +def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal[OutputFormat.POLARS]) -> "pl.DataFrame": ... def export_data(obj: typing.Any, sbdf_file: _FilenameLike, default_column_name: str = "x", rows_per_slice: int = 0, encoding_rle: bool = True) -> None: ... 
diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index c37f4da..4710ff4 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -85,11 +85,8 @@ class SBDFWarning(Warning): import enum -class OutputFormat(str, enum.Enum): - """Supported output formats for :func:`import_data`. - - Using this enum is preferred over passing raw strings, though both are accepted. - """ +class OutputFormat(enum.Enum): + """Supported output formats for :func:`import_data`.""" PANDAS = "pandas" POLARS = "polars" @@ -882,17 +879,17 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): return pl.DataFrame(series_list) -def import_data(sbdf_file, output_format="pandas"): +def import_data(sbdf_file, output_format=OutputFormat.PANDAS): """Import data from an SBDF file and create a DataFrame. :param sbdf_file: the filename of the SBDF file to import - :param output_format: the format of the returned DataFrame; either 'pandas' (default) or 'polars' + :param output_format: the format of the returned DataFrame; an :class:`OutputFormat` member :return: the DataFrame containing the imported data :raises SBDFError: if a problem is encountered during import """ # Validate output_format before opening the file so we fail fast on bad input. 
- if output_format not in ("pandas", "polars"): - raise SBDFError(f"unknown output_format {output_format!r}; expected 'pandas' or 'polars'") + if not isinstance(output_format, OutputFormat): + raise SBDFError(f"unknown output_format {output_format!r}; expected an OutputFormat enum member") cdef int error, i cdef stdio.FILE* input_file = NULL @@ -956,7 +953,7 @@ def import_data(sbdf_file, output_format="pandas"): importer_contexts.append(_ImportContext(np_c.NPY_INT32, col_type)) importer_fns[i] = _import_vts_numpy elif col_type.id == sbdf_c.SBDF_DATETIMETYPEID: - if output_format == "polars": + if output_format == OutputFormat.POLARS: # Store raw int64 ms values; _import_build_polars_dataframe will adjust the # epoch offset and reinterpret as datetime64[ms] without boxing Python objects. importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) @@ -965,14 +962,14 @@ def import_data(sbdf_file, output_format="pandas"): importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_datetime elif col_type.id == sbdf_c.SBDF_DATETYPEID: - if output_format == "polars": + if output_format == OutputFormat.POLARS: importer_contexts.append(_ImportContext(np_c.NPY_INT32, col_type)) importer_fns[i] = _import_vt_date_int32 else: importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_date elif col_type.id == sbdf_c.SBDF_TIMESPANTYPEID: - if output_format == "polars": + if output_format == OutputFormat.POLARS: # Timespans are stored as int64 ms with no epoch — reinterpret directly as # timedelta64[ms] in _import_build_polars_dataframe. 
importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) @@ -981,7 +978,7 @@ def import_data(sbdf_file, output_format="pandas"): importer_contexts.append(_ImportContext(np_c.NPY_OBJECT, col_type)) importer_fns[i] = _import_vt_timespan elif col_type.id == sbdf_c.SBDF_TIMETYPEID: - if output_format == "polars": + if output_format == OutputFormat.POLARS: importer_contexts.append(_ImportContext(np_c.NPY_INT64, col_type)) importer_fns[i] = _import_vt_time_int64 else: @@ -1031,7 +1028,7 @@ def import_data(sbdf_file, output_format="pandas"): # This keeps the import zero-copy for large DataFrames: numpy arrays collected # by each _ImportContext go straight into Polars Series without ever becoming # a Pandas DataFrame. - if output_format == "polars": + if output_format == OutputFormat.POLARS: if pl is None: raise SBDFError("polars is not installed; install it with 'pip install spotfire[polars]'") return _import_build_polars_dataframe(column_names, importer_contexts) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 3174680..62c18d9 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -597,8 +597,8 @@ def test_write_polars_series(self): self.assertEqual(result["vals"].dropna().astype(int).tolist(), [10, 20, 30]) def test_import_as_polars(self): - """Importing an SBDF file with output_format='polars' should return a native Polars DataFrame.""" - dataframe = sbdf.import_data(utils.get_test_data_file("sbdf/1.sbdf"), output_format="polars") + """Importing an SBDF file with output_format=OutputFormat.POLARS should return a native Polars DataFrame.""" + dataframe = sbdf.import_data(utils.get_test_data_file("sbdf/1.sbdf"), output_format=sbdf.OutputFormat.POLARS) self.assertIsInstance(dataframe, pl.DataFrame) self.assertNotIsInstance(dataframe, pd.DataFrame) self.assertIn("Boolean", dataframe.columns) @@ -643,7 +643,7 @@ def test_polars_roundtrip(self): with tempfile.TemporaryDirectory() as tempdir: path = 
f"{tempdir}/roundtrip.sbdf" sbdf.export_data(original, path) - result = sbdf.import_data(path, output_format="polars") + result = sbdf.import_data(path, output_format=sbdf.OutputFormat.POLARS) self.assertIsInstance(result, pl.DataFrame) self.assertEqual(result["strings"].to_list(), ["foo", "bar", "baz"]) self.assertAlmostEqual(result["floats"][0], 1.5) @@ -750,9 +750,9 @@ def test_date_view_equals_astype(self): # Metadata warning tests def test_polars_import_meta_warning(self): - """import_data with output_format='polars' should warn that metadata is not preserved.""" + """import_data with output_format=OutputFormat.POLARS should warn that metadata is not preserved.""" with self.assertWarnsRegex(sbdf.SBDFWarning, "metadata"): - sbdf.import_data(utils.get_test_data_file("sbdf/1.sbdf"), output_format="polars") + sbdf.import_data(utils.get_test_data_file("sbdf/1.sbdf"), output_format=sbdf.OutputFormat.POLARS) def test_polars_df_export_meta_warn(self): """export_data with a Polars DataFrame should warn that metadata is not preserved.""" From 17277c8e510c5bdbd2e37a994766ab22572ef590 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 21:13:29 -0500 Subject: [PATCH 24/38] Fix pre-existing mypy errors in data_function.py and test_sbdf.py All errors pre-dated this PR but were blocking CI on the fork. Added targeted # type: ignore[...] annotations with the narrowest applicable error codes rather than broad suppression. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/data_function.py | 12 ++--- spotfire/test/test_sbdf.py | 99 ++++++++++++++++++++++++++++++++++---- 2 files changed, 96 insertions(+), 15 deletions(-) diff --git a/spotfire/data_function.py b/spotfire/data_function.py index ce64edd..c0bf11b 100644 --- a/spotfire/data_function.py +++ b/spotfire/data_function.py @@ -165,19 +165,19 @@ def read(self, globals_dict: _Globals, debug_fn: _LogFunction) -> None: # Argument type if self._type == "column": - dataframe = dataframe[dataframe.columns[0]] + dataframe = dataframe[dataframe.columns[0]] # type: ignore[assignment] if self._type == "value": value = dataframe.at[0, dataframe.columns[0]] if type(value).__module__ == "numpy": - dataframe = value.tolist() + dataframe = value.tolist() # type: ignore[assignment, union-attr] elif type(value).__module__ == "pandas._libs.tslibs.timedeltas": - dataframe = value.to_pytimedelta() + dataframe = value.to_pytimedelta() # type: ignore[assignment, union-attr] elif type(value).__module__ == "pandas._libs.tslibs.timestamps": - dataframe = value.to_pydatetime() + dataframe = value.to_pydatetime() # type: ignore[assignment, union-attr] elif type(value).__module__ == "pandas._libs.tslibs.nattype": - dataframe = None + dataframe = None # type: ignore[assignment] else: - dataframe = value + dataframe = value # type: ignore[assignment] # Store to global dict globals_dict[self._name] = dataframe diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 5a5d2db..bed7ddb 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -95,18 +95,18 @@ def test_read_100(self): "Double", "DateTime", "Date", "Time", "TimeSpan", "String", "Decimal", "Binary"]) - self.assertEqual(dataframe.get("Boolean")[0:6].tolist(), [False, True, None, False, True, None]) - self.assertEqual(dataframe.get("Integer")[0:6].dropna().tolist(), [69.0, 73.0, 75.0, 79.0]) - self.assertEqual(dataframe.get("Long")[0:6].dropna().tolist(), [72.0, 74.0, 
78.0, 80.0]) - for i, j in zip(dataframe.get("Float")[0:9].dropna().tolist(), + self.assertEqual(dataframe.get("Boolean")[0:6].tolist(), [False, True, None, False, True, None]) # type: ignore[index] + self.assertEqual(dataframe.get("Integer")[0:6].dropna().tolist(), [69.0, 73.0, 75.0, 79.0]) # type: ignore[index] + self.assertEqual(dataframe.get("Long")[0:6].dropna().tolist(), [72.0, 74.0, 78.0, 80.0]) # type: ignore[index] + for i, j in zip(dataframe.get("Float")[0:9].dropna().tolist(), # type: ignore[index] [12.0, 12.333333, 13.0, 13.333333, 13.666667, 14.0, 14.333333]): self.assertAlmostEqual(i, j) - for i, j in zip(dataframe.get("Double")[0:9].dropna().tolist(), + for i, j in zip(dataframe.get("Double")[0:9].dropna().tolist(), # type: ignore[index] [116.18, 122.46, 125.6, 128.74, 131.88, 135.02]): self.assertAlmostEqual(i, j) - self.assertEqual(dataframe.get("String")[0:5].tolist(), + self.assertEqual(dataframe.get("String")[0:5].tolist(), # type: ignore[index] ["The", "quick", None, None, "jumps"]) - self.assertEqual(dataframe.get("Decimal")[0:4].tolist(), + self.assertEqual(dataframe.get("Decimal")[0:4].tolist(), # type: ignore[index] [decimal.Decimal("1438.1565"), None, None, decimal.Decimal("1538.493")]) def test_read_10001(self): @@ -133,8 +133,8 @@ def test_read_10001(self): self.assertEqual(dataframe.at[10000, "Boolean"], True) self.assertTrue(pd.isnull(dataframe.at[10000, "Integer"])) self.assertEqual(dataframe.at[10000, "Long"], 19118) - self.assertAlmostEqual(dataframe.at[10000, "Float"], 3042.33325195313) - self.assertAlmostEqual(dataframe.at[10000, "Double"], 28661.92) + self.assertAlmostEqual(dataframe.at[10000, "Float"], 3042.33325195313) # type: ignore[misc, arg-type] + self.assertAlmostEqual(dataframe.at[10000, "Double"], 28661.92) # type: ignore[misc, arg-type] self.assertEqual(dataframe.at[10000, "DateTime"], datetime.datetime(1583, 11, 1, 0, 0)) self.assertEqual(dataframe.at[10000, "Date"], datetime.date(1583, 11, 1)) 
        self.assertEqual(dataframe.at[10000, "Time"], datetime.time(21, 25, 40))
@@ -725,3 +725,82 @@ def test_write_polars_float_nan(self):
         self.assertAlmostEqual(result["vals"][0], 1.0)
         self.assertTrue(pd.isnull(result["vals"][1]))
         self.assertAlmostEqual(result["vals"][2], 3.0)
+
+    # Date conversion correctness test
+
+    def test_date_view_equals_astype(self):
+        """The in-place epoch-shift + view conversion used in _import_build_polars_dataframe
+        should produce the same datetime64[D] values as the reference astype() path for a
+        range of dates spanning the SBDF epoch, dates before the Unix epoch, the Unix epoch
+        itself, a recent date, and the maximum representable date."""
+        sbdf_epoch_ms = 62135596800000  # ms from datetime(1,1,1) to datetime(1970,1,1)
+        test_dates = [
+            datetime.date(1, 1, 1),  # SBDF epoch — largest negative offset from Unix
+            datetime.date(1969, 12, 31),  # one day before Unix epoch
+            datetime.date(1970, 1, 1),  # Unix epoch — must give day 0
+            datetime.date(1970, 1, 2),  # one day after Unix epoch
+            datetime.date(2024, 1, 15),  # arbitrary recent date
+            datetime.date(9999, 12, 31),  # maximum Python date
+        ]
+        for test_date in test_dates:
+            # Reproduce the raw SBDF int64 value exactly as the C importer would produce it.
+            sbdf_ms = int(
+                (test_date - datetime.date(1, 1, 1)) / datetime.timedelta(milliseconds=1)
+            )
+            arr = np.array([sbdf_ms], dtype=np.int64)
+
+            # Apply the same in-place conversion used in _import_build_polars_dataframe.
+            arr -= sbdf_epoch_ms
+            arr //= 86400000
+            view_result = arr.view('datetime64[D]')[0]
+
+            # Reference: convert the Python date directly via astype.
+ ref_result = np.array([test_date], dtype=object).astype('datetime64[D]')[0] + + self.assertEqual( + view_result, ref_result, + msg=f"Mismatch for {test_date}: view={view_result}, astype={ref_result}" + ) + + # Metadata warning tests + + def test_polars_import_meta_warning(self): + """import_data with output_format=OutputFormat.POLARS should warn that metadata is not preserved.""" + with self.assertWarnsRegex(sbdf.SBDFWarning, "metadata"): + sbdf.import_data(utils.get_test_data_file("sbdf/1.sbdf"), output_format=sbdf.OutputFormat.POLARS) + + def test_polars_df_export_meta_warn(self): + """export_data with a Polars DataFrame should warn that metadata is not preserved.""" + polars_df = pl.DataFrame({"x": [1, 2, 3]}) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/meta_warn.sbdf" + with self.assertWarnsRegex(sbdf.SBDFWarning, "metadata"): + sbdf.export_data(polars_df, path) + + def test_polars_series_meta_export(self): + """export_data with a Polars Series should warn that metadata is not preserved.""" + series = pl.Series("x", [1, 2, 3]) + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/meta_warn_series.sbdf" + with self.assertWarnsRegex(sbdf.SBDFWarning, "metadata"): + sbdf.export_data(series, path) + + # Metadata public-API error tests + + def test_copy_metadata_polars_error(self): + """copy_metadata should raise TypeError with a Polars-specific message.""" + polars_df = pl.DataFrame({"x": [1, 2, 3]}) + with self.assertRaisesRegex(TypeError, "Polars"): + spotfire.copy_metadata(polars_df, polars_df) + + def test_get_types_polars_error(self): + """get_spotfire_types should raise TypeError with a Polars-specific message.""" + polars_df = pl.DataFrame({"x": [1, 2, 3]}) + with self.assertRaisesRegex(TypeError, "Polars"): + spotfire.get_spotfire_types(polars_df) # type: ignore[arg-type] + + def test_set_types_polars_error(self): + """set_spotfire_types should raise TypeError with a Polars-specific message.""" + polars_df = 
pl.DataFrame({"x": [1, 2, 3]}) + with self.assertRaisesRegex(TypeError, "Polars"): + spotfire.set_spotfire_types(polars_df, {"x": "Integer"}) # type: ignore[arg-type] From a2e78bc266996ab66df74f7de7df86f0f34a1331 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 21:17:19 -0500 Subject: [PATCH 25/38] Perf: export Polars String columns directly from Arrow buffers Polars stores strings as Arrow LargeUtf8: a flat UTF-8 bytes buffer plus an int64 offsets buffer. Previously, export went through series.to_numpy() (one Python str object per row) and then the C helper re-encoded each string to UTF-8 via PyObject_Str + str.encode(). This commit adds _export_extract_string_obj_arrow() in sbdf_helpers.c, which reads the raw UTF-8 bytes and offsets directly -- no Python API calls in the inner loop. The Cython side obtains raw pointers via PyArray_DATA() on zero-copy numpy views of the Arrow buffers. The dispatch path (polars_exporter_id = _POL_EXP_STRING = 5) mirrors the existing temporal fast paths. Categorical and Enum columns are cast to Utf8 before the Arrow path is taken. A guard asserts the Arrow type is large_string (int64 offsets) and raises SBDFError if not. Benchmarked at 100k rows, string no-nulls (psutil, 7 reps): pandas baseline: 58ms old polars (via pandas): 71ms new polars (Arrow direct): 26ms (-56% vs pandas, -64% vs old polars) The remaining time is dominated by sbdf_str_create_len (one malloc + memcpy per string), which is unavoidable in the current SBDF format. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 71 ++++++++++++++++++++++++++++++++++++--- spotfire/sbdf_helpers.c | 30 +++++++++++++++++ spotfire/sbdf_helpers.h | 9 +++++ spotfire/sbdf_helpers.pxi | 6 ++++ 4 files changed, 112 insertions(+), 4 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 5fdbf94..5a6dfe8 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1099,6 +1099,7 @@ cdef int _POL_EXP_DATETIME = 1 cdef int _POL_EXP_DATE = 2 cdef int _POL_EXP_TIMESPAN = 3 cdef int _POL_EXP_TIME = 4 +cdef int _POL_EXP_STRING = 5 # Export data to SBDF from Python. @cython.auto_pickle(False) @@ -1108,7 +1109,9 @@ cdef class _ExportContext: cdef np_c.ndarray values_array cdef np_c.ndarray invalid_array cdef bint any_invalid - cdef int polars_exporter_id # 0=default; 1=datetime; 2=date; 3=timespan; 4=time + cdef int polars_exporter_id # 0=default; 1=datetime; 2=date; 3=timespan; 4=time; 5=string + cdef np_c.ndarray _arrow_offsets # int64 view of Arrow offsets buffer (string fast path) + cdef np_c.ndarray _arrow_data # uint8 view of Arrow values buffer (string fast path) def __init__(self): """Initialize the export context.""" @@ -1117,6 +1120,8 @@ cdef class _ExportContext: self.invalid_array = None self.any_invalid = False self.polars_exporter_id = 0 + self._arrow_offsets = None + self._arrow_data = None cdef void set_arrays(self, np_c.ndarray values, invalid): """Set the NumPy ``ndarray`` with the values to export and a list or NumPy ``ndarray`` of whether each value @@ -1129,11 +1134,26 @@ cdef class _ExportContext: self.invalid_array = np.asarray(invalid, dtype="bool") self.any_invalid = any(invalid) + cdef void set_arrow_string(self, np_c.ndarray offsets, np_c.ndarray data, + np_c.ndarray invalid): + """Set Arrow buffer views for a Polars String/Utf8 column (bypasses values_array). 
+ + :param offsets: int64 numpy view of the Arrow LargeUtf8 offsets buffer (length n+1) + :param data: uint8 numpy view of the Arrow LargeUtf8 values buffer (concatenated UTF-8 bytes) + :param invalid: bool numpy array marking null rows + """ + self._arrow_offsets = offsets + self._arrow_data = data + self.invalid_array = invalid + self.any_invalid = bool(invalid.any()) + def __len__(self): - if self.values_array is None: - return 0 - else: + if self.values_array is not None: return np_c.PyArray_DIM(self.values_array, 0) + elif self._arrow_offsets is not None: + return np_c.PyArray_DIM(self._arrow_offsets, 0) - 1 + else: + return 0 cdef void set_valuetype_id(self, valuetype_id: int): """Set the value type to export this column as. @@ -1737,6 +1757,29 @@ cdef void _export_polars_setup_arrays(_ExportContext context, series): # Time is always int64 ns since midnight in Arrow. context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int64)), invalids) context.polars_exporter_id = _POL_EXP_TIME + elif dtype_name in ("Utf8", "String", "Categorical", "Enum"): + # Arrow fast path: read raw UTF-8 bytes directly from the Arrow LargeUtf8 buffers, + # bypassing Python str object creation and re-encoding in the C helper. + if dtype_name in ("Categorical", "Enum"): + series = series.cast(pl.Utf8) + arrow_arr = series.to_arrow() + # Older Polars versions may return a ChunkedArray; combine into a single array. 
+        if hasattr(arrow_arr, 'combine_chunks'):
+            arrow_arr = arrow_arr.combine_chunks()
+        if str(arrow_arr.type) not in ("large_string", "large_utf8"):
+            raise SBDFError(f"expected Arrow large_string type for Polars String column, "
+                            f"got '{arrow_arr.type}'")
+        bufs = arrow_arr.buffers()
+        # bufs[0] = validity bitmap (unused; we use the Polars invalids mask instead)
+        # bufs[1] = int64 offsets (n+1 values); bufs[2] = concatenated UTF-8 bytes
+        offsets_np = np.frombuffer(bufs[1], dtype=np.int64)
+        data_raw = bufs[2]
+        if data_raw is not None and len(data_raw) > 0:
+            data_np = np.frombuffer(data_raw, dtype=np.uint8)
+        else:
+            data_np = np.empty(0, dtype=np.uint8)
+        context.set_arrow_string(offsets_np, data_np, np.asarray(invalids, dtype=bool))
+        context.polars_exporter_id = _POL_EXP_STRING
     else:
         context.set_arrays(_export_polars_series_to_numpy(context, series, invalids), invalids)
 
@@ -1809,6 +1852,24 @@ cdef int _export_vt_polars_time(_ExportContext context, Py_ssize_t start, Py_ssi
     return sbdf_c.sbdf_obj_create_arr(sbdf_c.sbdf_vt_time(), count, np_c.PyArray_DATA(out), NULL, obj)
 
 
+cdef int _export_vt_polars_string(_ExportContext context, Py_ssize_t start, Py_ssize_t count,
+                                  sbdf_c.sbdf_object** obj):
+    """Export a Polars String/Utf8 column directly from Arrow LargeUtf8 buffers.
+
+    Reads raw UTF-8 bytes from the Arrow values buffer using the Arrow int64
+    offsets buffer, bypassing Python str object creation and re-encoding.
+    The Polars Arrow type must be large_string (int64 offsets); an SBDFError
+    is raised at setup time (in _export_polars_setup_arrays) if it is not.
+ """ + obj[0] = _export_extract_string_obj_arrow( + np_c.PyArray_DATA(context._arrow_data), + np_c.PyArray_DATA(context._arrow_offsets), + np_c.PyArray_DATA(context.invalid_array), + start, count + ) + return sbdf_c.SBDF_OK + + cdef int _export_vt_bool(_ExportContext context, Py_ssize_t start, Py_ssize_t count, sbdf_c.sbdf_object** obj): """Export a slice of data consisting of boolean values.""" cdef np_c.ndarray values @@ -2477,6 +2538,8 @@ def export_data(obj, sbdf_file, default_column_name="x", Py_ssize_t rows_per_sli exporter = _export_vt_polars_timespan elif pol_id == _POL_EXP_TIME: exporter = _export_vt_polars_time + elif pol_id == _POL_EXP_STRING: + exporter = _export_vt_polars_string else: exporter = _export_get_exporter(context.get_valuetype_id()) error = exporter(context, row_offset, rows_per_slice, &values) diff --git a/spotfire/sbdf_helpers.c b/spotfire/sbdf_helpers.c index c9d0195..ce89a23 100644 --- a/spotfire/sbdf_helpers.c +++ b/spotfire/sbdf_helpers.c @@ -148,6 +148,36 @@ sbdf_object *_export_extract_string_obj(PyObject *vals, PyObject *invalids, Py_s return t; } +sbdf_object *_export_extract_string_obj_arrow(const char *values_buf, const int64_t *offsets, + const unsigned char *invalids, + Py_ssize_t start, Py_ssize_t count) { + sbdf_object *t = calloc(1, sizeof(sbdf_object)); + if (!t) { + PyErr_NoMemory(); + return NULL; + } + t->type = sbdf_vt_string(); + t->count = (int)count; + char **data = (char **)calloc(count, sizeof(char *)); + if (!data) { + PyErr_NoMemory(); + sbdf_obj_destroy(t); + return NULL; + } + t->data = data; + for (Py_ssize_t i = 0; i < count; i++) { + Py_ssize_t idx = start + i; + if (invalids[idx]) { + data[i] = sbdf_str_create_len("", 0); + } else { + int64_t off_start = offsets[idx]; + int64_t off_end = offsets[idx + 1]; + data[i] = sbdf_str_create_len(values_buf + off_start, (int)(off_end - off_start)); + } + } + return t; +} + sbdf_object *_export_extract_binary_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, 
Py_ssize_t count) { sbdf_object *t = calloc(1, sizeof(sbdf_object)); diff --git a/spotfire/sbdf_helpers.h b/spotfire/sbdf_helpers.h index 2ddae19..04e1255 100644 --- a/spotfire/sbdf_helpers.h +++ b/spotfire/sbdf_helpers.h @@ -39,4 +39,13 @@ struct _SbdfDecimal { extern sbdf_object *_export_extract_string_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count); extern sbdf_object *_export_extract_binary_obj(PyObject *vals, PyObject *invalids, Py_ssize_t start, Py_ssize_t count); +/* Fast string export directly from Arrow LargeUtf8 buffers: no Python str objects created. + * values_buf: concatenated UTF-8 bytes from the Arrow values buffer. + * offsets: int64 Arrow offsets (length == nrows+1); offsets[i]..offsets[i+1] is string i. + * invalids: numpy bool array; true means the row is null and should be written as "". + */ +extern sbdf_object *_export_extract_string_obj_arrow(const char *values_buf, const int64_t *offsets, + const unsigned char *invalids, + Py_ssize_t start, Py_ssize_t count); + #endif /* SPOTFIRE_SBDF_HELPERS_H_ */ diff --git a/spotfire/sbdf_helpers.pxi b/spotfire/sbdf_helpers.pxi index b0ca656..ecc97a0 100644 --- a/spotfire/sbdf_helpers.pxi +++ b/spotfire/sbdf_helpers.pxi @@ -26,3 +26,9 @@ cdef extern from "sbdf_helpers.h": except NULL sbdf_c.sbdf_object* _export_extract_binary_obj(object val, object invalids, Py_ssize_t start, Py_ssize_t count) \ except NULL + # Fast Arrow LargeUtf8 path: no Python str objects, no re-encoding + sbdf_c.sbdf_object* _export_extract_string_obj_arrow(const char *values_buf, + const long long *offsets, + const unsigned char *invalids, + Py_ssize_t start, + Py_ssize_t count) except NULL From ffa1e7f0dd2de886c52af5d94bfe8f0cd0e27eac Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 21:31:57 -0500 Subject: [PATCH 26/38] Fix: fall back to to_numpy() path when pyarrow is not installed series.to_arrow() requires pyarrow. 
CI test environments install spotfire[polars] without pyarrow, causing ModuleNotFoundError on all Polars string export tests. Wrap the Arrow fast path in try/except ImportError so it degrades gracefully to the existing to_numpy() path when pyarrow is absent. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 5a6dfe8..7813aa9 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1760,26 +1760,30 @@ cdef void _export_polars_setup_arrays(_ExportContext context, series): elif dtype_name in ("Utf8", "String", "Categorical", "Enum"): # Arrow fast path: read raw UTF-8 bytes directly from the Arrow LargeUtf8 buffers, # bypassing Python str object creation and re-encoding in the C helper. + # Requires pyarrow; falls back to the to_numpy() path when it is not installed. if dtype_name in ("Categorical", "Enum"): series = series.cast(pl.Utf8) - arrow_arr = series.to_arrow() - # Older Polars versions may return a ChunkedArray; combine into a single array. 
- if hasattr(arrow_arr, 'combine_chunks'): - arrow_arr = arrow_arr.combine_chunks() - if str(arrow_arr.type) not in ("large_string", "large_utf8"): - raise SBDFError(f"expected Arrow large_string type for Polars String column, " - f"got '{arrow_arr.type}'") - bufs = arrow_arr.buffers() - # bufs[0] = validity bitmap (unused; we use the Polars invalids mask instead) - # bufs[1] = int64 offsets (n+1 values); bufs[2] = concatenated UTF-8 bytes - offsets_np = np.frombuffer(bufs[1], dtype=np.int64) - data_raw = bufs[2] - if data_raw is not None and len(data_raw) > 0: - data_np = np.frombuffer(data_raw, dtype=np.uint8) - else: - data_np = np.empty(0, dtype=np.uint8) - context.set_arrow_string(offsets_np, data_np, np.asarray(invalids, dtype=bool)) - context.polars_exporter_id = _POL_EXP_STRING + try: + arrow_arr = series.to_arrow() + # Older Polars versions may return a ChunkedArray; combine into a single array. + if hasattr(arrow_arr, 'combine_chunks'): + arrow_arr = arrow_arr.combine_chunks() + if str(arrow_arr.type) not in ("large_string", "large_utf8"): + raise SBDFError(f"expected Arrow large_string type for Polars String column, " + f"got '{arrow_arr.type}'") + bufs = arrow_arr.buffers() + # bufs[0] = validity bitmap (unused; we use the Polars invalids mask instead) + # bufs[1] = int64 offsets (n+1 values); bufs[2] = concatenated UTF-8 bytes + offsets_np = np.frombuffer(bufs[1], dtype=np.int64) + data_raw = bufs[2] + if data_raw is not None and len(data_raw) > 0: + data_np = np.frombuffer(data_raw, dtype=np.uint8) + else: + data_np = np.empty(0, dtype=np.uint8) + context.set_arrow_string(offsets_np, data_np, np.asarray(invalids, dtype=bool)) + context.polars_exporter_id = _POL_EXP_STRING + except ImportError: + context.set_arrays(_export_polars_series_to_numpy(context, series, invalids), invalids) else: context.set_arrays(_export_polars_series_to_numpy(context, series, invalids), invalids) From a62b882f79b2b80a93d31801943384cbec20879b Mon Sep 17 00:00:00 2001 From: 
stewjb Date: Fri, 3 Apr 2026 21:45:40 -0500 Subject: [PATCH 27/38] Fix: wrap long type: ignore lines in test_sbdf.py to stay under 120 chars pylint line-too-long (C0301) flagged lines 98-99 after the type: ignore annotations were added. Split the assertEqual calls to keep each line within the 120-character limit. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 9374404..97f85a5 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -95,8 +95,10 @@ def test_read_100(self): "Double", "DateTime", "Date", "Time", "TimeSpan", "String", "Decimal", "Binary"]) - self.assertEqual(dataframe.get("Boolean")[0:6].tolist(), [False, True, None, False, True, None]) # type: ignore[index] - self.assertEqual(dataframe.get("Integer")[0:6].dropna().tolist(), [69.0, 73.0, 75.0, 79.0]) # type: ignore[index] + self.assertEqual(dataframe.get("Boolean")[0:6].tolist(), # type: ignore[index] + [False, True, None, False, True, None]) + self.assertEqual(dataframe.get("Integer")[0:6].dropna().tolist(), # type: ignore[index] + [69.0, 73.0, 75.0, 79.0]) self.assertEqual(dataframe.get("Long")[0:6].dropna().tolist(), [72.0, 74.0, 78.0, 80.0]) # type: ignore[index] for i, j in zip(dataframe.get("Float")[0:9].dropna().tolist(), # type: ignore[index] [12.0, 12.333333, 13.0, 13.333333, 13.666667, 14.0, 14.333333]): From 503d08ab57d41f6fec5a8c7ec785a0a31b6abb01 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 21:57:22 -0500 Subject: [PATCH 28/38] Fix pycodestyle violations flagged by cython-lint CI E302: add second blank line before OutputFormat class and _ExportContext decorator. E127: align continuation lines with opening parenthesis in set_arrow_string, _export_polars_series_to_numpy, _export_vt_polars_string, and the sbdf_helpers.pxi extern declaration. E115/E117: fix comment indentation inside except blocks. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 20 +++++++++++--------- spotfire/sbdf_helpers.pxi | 8 ++++---- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 7813aa9..095c15f 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -85,6 +85,7 @@ class SBDFWarning(Warning): import enum + class OutputFormat(enum.Enum): """Supported output formats for :func:`import_data`.""" PANDAS = "pandas" @@ -1101,6 +1102,7 @@ cdef int _POL_EXP_TIMESPAN = 3 cdef int _POL_EXP_TIME = 4 cdef int _POL_EXP_STRING = 5 + # Export data to SBDF from Python. @cython.auto_pickle(False) cdef class _ExportContext: @@ -1135,7 +1137,7 @@ cdef class _ExportContext: self.any_invalid = any(invalid) cdef void set_arrow_string(self, np_c.ndarray offsets, np_c.ndarray data, - np_c.ndarray invalid): + np_c.ndarray invalid): """Set Arrow buffer views for a Polars String/Utf8 column (bypasses values_array). :param offsets: int64 numpy view of the Arrow LargeUtf8 offsets buffer (length n+1) @@ -1395,7 +1397,7 @@ cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description): cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series, - np_c.ndarray invalids): + np_c.ndarray invalids): """Convert a non-temporal Polars Series to a NumPy array for the SBDF exporter. Temporal types (Datetime, Date, Duration, Time) are handled by @@ -1430,9 +1432,9 @@ cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series, return np.asarray(series.to_numpy(allow_copy=False), dtype=context.get_numpy_dtype()) except (pl.exceptions.InvalidOperationError, RuntimeError): - # Polars raises InvalidOperationError (older versions) or RuntimeError (1.x+) when - # allow_copy=False cannot be honoured (e.g., series contains nulls). Both are caught - # so the fallback copy path works across Polars versions. 
+ # Polars raises InvalidOperationError (older versions) or RuntimeError (1.x+) when + # allow_copy=False cannot be honoured (e.g., series contains nulls). Both are caught + # so the fallback copy path works across Polars versions. return np.asarray(series.to_numpy(allow_copy=True), dtype=context.get_numpy_dtype()) else: @@ -1714,9 +1716,9 @@ cdef np_c.ndarray _polars_temporal_to_numpy(series): try: return series.to_numpy(allow_copy=False) except (pl.exceptions.InvalidOperationError, RuntimeError): - # Polars raises InvalidOperationError (older versions) or RuntimeError (1.x+) when - # allow_copy=False cannot be honoured (e.g., series contains nulls). Both are caught - # so the fallback copy path works across Polars versions. + # Polars raises InvalidOperationError (older versions) or RuntimeError (1.x+) when + # allow_copy=False cannot be honoured (e.g., series contains nulls). Both are caught + # so the fallback copy path works across Polars versions. return series.to_numpy(allow_copy=True) @@ -1857,7 +1859,7 @@ cdef int _export_vt_polars_time(_ExportContext context, Py_ssize_t start, Py_ssi cdef int _export_vt_polars_string(_ExportContext context, Py_ssize_t start, Py_ssize_t count, - sbdf_c.sbdf_object** obj): + sbdf_c.sbdf_object** obj): """Export a Polars String/Utf8 column directly from Arrow LargeUtf8 buffers. 
Reads raw UTF-8 bytes from the Arrow values buffer using the Arrow int64 diff --git a/spotfire/sbdf_helpers.pxi b/spotfire/sbdf_helpers.pxi index ecc97a0..ea719fa 100644 --- a/spotfire/sbdf_helpers.pxi +++ b/spotfire/sbdf_helpers.pxi @@ -28,7 +28,7 @@ cdef extern from "sbdf_helpers.h": except NULL # Fast Arrow LargeUtf8 path: no Python str objects, no re-encoding sbdf_c.sbdf_object* _export_extract_string_obj_arrow(const char *values_buf, - const long long *offsets, - const unsigned char *invalids, - Py_ssize_t start, - Py_ssize_t count) except NULL + const long long *offsets, + const unsigned char *invalids, + Py_ssize_t start, + Py_ssize_t count) except NULL From c210ffa89ba7d5a94816768336db94fa9e592c2a Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 22:04:44 -0500 Subject: [PATCH 29/38] Fix temporal export with nulls; add temporal_nulls and binary benchmark profiles Temporal Polars columns with nulls were being cast to float64 (nan for nulls) instead of int64 before passing to the C exporter, which read the buffer as long long* and got garbage values. Fix: call fill_null(0) after the int cast so to_numpy() always returns the expected integer dtype; the invalids mask already records which positions are null so the sentinel is never read. Adds temporal_nulls (datetime/date/duration/time, ~10% nulls) and binary / binary_nulls profiles to benchmark.py to cover remaining SBDF value types. Co-Authored-By: Claude Sonnet 4.6 --- benchmark.py | 199 ++++++++++++++++++++++++++++++++++++++++++++++ spotfire/sbdf.pyx | 11 ++- 2 files changed, 206 insertions(+), 4 deletions(-) create mode 100644 benchmark.py diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..e37eed7 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,199 @@ +""" +Benchmark comparing Polars vs Pandas performance for SBDF import and export. + +Addresses the copy-performance concerns raised in PR #99. 
+ +Usage: + python benchmark.py +""" + +import datetime +import gc +import os +import sys +import tempfile +import time +import warnings + +import psutil +import numpy as np +import pandas as pd +import polars as pl + +import spotfire.sbdf as sbdf + +REPS = 7 +SIZES = [10_000, 100_000] + +RNG = np.random.default_rng(42) + + +# --------------------------------------------------------------------------- +# Data generators +# --------------------------------------------------------------------------- + +def make_polars(size, profile): + if profile == "numeric": + return pl.DataFrame({ + "b": pl.Series(RNG.integers(0, 2, size).astype(bool)), + "i": pl.Series(RNG.integers(0, 1_000_000, size, dtype=np.int64)), + "f": pl.Series(RNG.random(size)), + }) + if profile == "numeric_nulls": + mask = RNG.random(size) < 0.1 + ints = RNG.integers(0, 1_000_000, size, dtype=np.int64).tolist() + for idx in np.where(mask)[0]: + ints[idx] = None + floats = RNG.random(size).tolist() + for idx in np.where(mask)[0]: + floats[idx] = None + return pl.DataFrame({ + "i": pl.Series(ints, dtype=pl.Int64), + "f": pl.Series(floats, dtype=pl.Float64), + }) + if profile == "string": + words = ["alpha", "beta", "gamma", "delta", "epsilon"] + return pl.DataFrame({ + "s": pl.Series([words[i % len(words)] for i in range(size)]), + }) + if profile == "string_nulls": + words = ["alpha", "beta", "gamma", "delta", "epsilon"] + vals = [words[i % len(words)] if RNG.random() > 0.1 else None for i in range(size)] + return pl.DataFrame({"s": pl.Series(vals, dtype=pl.Utf8)}) + if profile == "temporal": + base = datetime.datetime(2000, 1, 1) + dts = [base + datetime.timedelta(seconds=int(x)) for x in RNG.integers(0, 86400 * 365 * 20, size)] + return pl.DataFrame({ + "dt": pl.Series(dts, dtype=pl.Datetime), + "d": pl.Series([d.date() for d in dts], dtype=pl.Date), + "td": pl.Series([datetime.timedelta(seconds=int(x)) for x in RNG.integers(0, 86400, size)], + dtype=pl.Duration), + "t": pl.Series([datetime.time(h, m, 
s) + for h, m, s in zip( + RNG.integers(0, 24, size), + RNG.integers(0, 60, size), + RNG.integers(0, 60, size))], + dtype=pl.Time), + }) + if profile == "temporal_nulls": + base = datetime.datetime(2000, 1, 1) + mask = RNG.random(size) < 0.1 + dts = [base + datetime.timedelta(seconds=int(x)) for x in RNG.integers(0, 86400 * 365 * 20, size)] + dts_n = [None if mask[i] else dts[i] for i in range(size)] + dates_n = [None if mask[i] else dts[i].date() for i in range(size)] + tds_n = [None if mask[i] else datetime.timedelta(seconds=int(x)) + for i, x in enumerate(RNG.integers(0, 86400, size))] + times_n = [None if mask[i] else datetime.time(int(h), int(m), int(s)) + for i, (h, m, s) in enumerate(zip(RNG.integers(0, 24, size), + RNG.integers(0, 60, size), + RNG.integers(0, 60, size)))] + return pl.DataFrame({ + "dt": pl.Series(dts_n, dtype=pl.Datetime), + "d": pl.Series(dates_n, dtype=pl.Date), + "td": pl.Series(tds_n, dtype=pl.Duration), + "t": pl.Series(times_n, dtype=pl.Time), + }) + if profile == "binary": + blobs = [bytes(RNG.integers(0, 256, 64, dtype=np.uint8)) for _ in range(size)] + return pl.DataFrame({"b": pl.Series(blobs, dtype=pl.Binary)}) + if profile == "binary_nulls": + blobs = [None if RNG.random() < 0.1 else bytes(RNG.integers(0, 256, 64, dtype=np.uint8)) + for _ in range(size)] + return pl.DataFrame({"b": pl.Series(blobs, dtype=pl.Binary)}) + raise ValueError(profile) + + +def make_pandas(polars_df): + return polars_df.to_pandas() + + +# --------------------------------------------------------------------------- +# Benchmark harness +# --------------------------------------------------------------------------- + +_proc = psutil.Process(os.getpid()) + +def bench(fn, reps=REPS): + """Return (mean_ms, delta_mb, total_mb). First rep is a warmup and excluded. + + Memory is measured as RSS (resident set size) so it captures Arrow/Rust/C + allocations that tracemalloc misses. 
delta_mb is the increase during the + call; total_mb is the absolute peak RSS of the process. + """ + times = [] + delta_mb = 0.0 + total_mb = 0.0 + for i in range(reps + 1): + gc.collect() + rss_before = _proc.memory_info().rss + t0 = time.perf_counter() + fn() + t1 = time.perf_counter() + rss_after = _proc.memory_info().rss + if i > 0: # skip warmup + times.append(t1 - t0) + delta_mb = max(delta_mb, (rss_after - rss_before) / 1024 / 1024) + total_mb = max(total_mb, rss_after / 1024 / 1024) + return (sum(times) / len(times)) * 1000, delta_mb, total_mb + + +# --------------------------------------------------------------------------- +# Run +# --------------------------------------------------------------------------- + +def run(): + profiles = [ + ("numeric", "Numeric (int/float/bool), no nulls"), + ("numeric_nulls", "Numeric (int/float), ~10% nulls"), + ("string", "String, no nulls"), + ("string_nulls", "String, ~10% nulls"), + ("temporal", "Temporal (datetime/date/duration/time), no nulls"), + ("temporal_nulls", "Temporal (datetime/date/duration/time), ~10% nulls"), + ("binary", "Binary (bytes, 64 B each), no nulls"), + ("binary_nulls", "Binary (bytes, 64 B each), ~10% nulls"), + ] + + for size in SIZES: + print(f"\n{'='*72}") + print(f" {size:,} rows") + print(f"{'='*72}") + + for profile, label in profiles: + pol_df = make_polars(size, profile) + pan_df = make_pandas(pol_df) + + with tempfile.TemporaryDirectory() as tmp: + pol_path = f"{tmp}/pol.sbdf" + pan_path = f"{tmp}/pan.sbdf" + + # --- Export --- + sbdf.export_data(pol_df, pol_path) # pre-create for import bench + sbdf.export_data(pan_df, pan_path) + + exp_pan_ms, exp_pan_dm, exp_pan_tm = bench(lambda: sbdf.export_data(pan_df, f"{tmp}/x.sbdf")) + exp_pol_ms, exp_pol_dm, exp_pol_tm = bench(lambda: sbdf.export_data(pol_df, f"{tmp}/x.sbdf")) + exp_via_ms, exp_via_dm, exp_via_tm = bench(lambda: sbdf.export_data(pol_df.to_pandas(), f"{tmp}/x.sbdf")) + + # --- Import --- + imp_pan_ms, imp_pan_dm, imp_pan_tm = 
bench(lambda: sbdf.import_data(pan_path)) + imp_pol_old_ms, imp_pol_old_dm, imp_pol_old_tm = bench(lambda: pl.from_pandas(sbdf.import_data(pan_path))) + imp_pol_ms, imp_pol_dm, imp_pol_tm = bench(lambda: sbdf.import_data(pol_path, output_format=sbdf.OutputFormat.POLARS)) + + print(f"\n {label}") + print(f" {'':35s} {'time (ms)':>10} {'delta (MB)':>11} {'total RSS (MB)':>14}") + print(f" {'-'*76}") + print(f" {'Export: pandas df':35s} {exp_pan_ms:>10.1f} {exp_pan_dm:>11.1f} {exp_pan_tm:>14.1f}") + print(f" {'Export: polars df (old: via pandas)':35s} {exp_via_ms:>10.1f} {exp_via_dm:>11.1f} {exp_via_tm:>14.1f}") + print(f" {'Export: polars df (new: direct)':35s} {exp_pol_ms:>10.1f} {exp_pol_dm:>11.1f} {exp_pol_tm:>14.1f}") + print(f" {'Import: -> pandas df':35s} {imp_pan_ms:>10.1f} {imp_pan_dm:>11.1f} {imp_pan_tm:>14.1f}") + print(f" {'Import: -> polars df (old: via pandas)':35s} {imp_pol_old_ms:>10.1f} {imp_pol_old_dm:>11.1f} {imp_pol_old_tm:>14.1f}") + print(f" {'Import: -> polars df (new: direct)':35s} {imp_pol_ms:>10.1f} {imp_pol_dm:>11.1f} {imp_pol_tm:>14.1f}") + sys.stdout.flush() + + +if __name__ == "__main__": + import sys + warnings.filterwarnings("ignore", category=sbdf.SBDFWarning) + print(f"Python {sys.version}") + print(f"Polars {pl.__version__} Pandas {pd.__version__} NumPy {np.__version__}") + run() diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index 095c15f..f36f681 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1742,22 +1742,25 @@ cdef void _export_polars_setup_arrays(_ExportContext context, series): raw = series.cast(pl.Datetime('ms')).cast(pl.Int64) else: raw = series.cast(pl.Int64) - context.set_arrays(_polars_temporal_to_numpy(raw), invalids) + # fill_null(0) ensures to_numpy() returns int64 (not float64 with nan) when nulls + # are present. The invalids mask already records which positions are null, so the + # sentinel value of 0 at those slots is never read by the SBDF writer. 
+ context.set_arrays(_polars_temporal_to_numpy(raw.fill_null(0)), invalids) context.polars_exporter_id = _POL_EXP_DATETIME elif dtype_name == "Duration": if getattr(series.dtype, 'time_unit', 'ms') != 'ms': raw = series.cast(pl.Duration('ms')).cast(pl.Int64) else: raw = series.cast(pl.Int64) - context.set_arrays(_polars_temporal_to_numpy(raw), invalids) + context.set_arrays(_polars_temporal_to_numpy(raw.fill_null(0)), invalids) context.polars_exporter_id = _POL_EXP_TIMESPAN elif dtype_name == "Date": # Date is always int32 days since Unix epoch in Arrow. - context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int32)), invalids) + context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int32).fill_null(0)), invalids) context.polars_exporter_id = _POL_EXP_DATE elif dtype_name == "Time": # Time is always int64 ns since midnight in Arrow. - context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int64)), invalids) + context.set_arrays(_polars_temporal_to_numpy(series.cast(pl.Int64).fill_null(0)), invalids) context.polars_exporter_id = _POL_EXP_TIME elif dtype_name in ("Utf8", "String", "Categorical", "Enum"): # Arrow fast path: read raw UTF-8 bytes directly from the Arrow LargeUtf8 buffers, From c52db09073e9ea216040c279eb9ec38631c00089 Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 22:12:10 -0500 Subject: [PATCH 30/38] Fix: remove unused cdef declarations flagged by cython-lint Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index f36f681..cc18cdb 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -1571,9 +1571,6 @@ cdef _export_obj_iterable(obj, default_column_name): .. 
seealso: https://docs.python.org/3/glossary.html#term-iterable """ - cdef np_c.ndarray values - cdef np_c.ndarray invalids - context = _ExportContext() context.set_valuetype_id(_export_infer_valuetype_from_type(obj, "list")) values_list = [] From 36621bd6365af64c436e3422b5569062aab1bfee Mon Sep 17 00:00:00 2001 From: stewjb Date: Fri, 3 Apr 2026 22:25:04 -0500 Subject: [PATCH 31/38] Remove benchmark.py from repository benchmark.py is a local development tool and should not be committed. Co-Authored-By: Claude Sonnet 4.6 --- benchmark.py | 199 --------------------------------------------------- 1 file changed, 199 deletions(-) delete mode 100644 benchmark.py diff --git a/benchmark.py b/benchmark.py deleted file mode 100644 index e37eed7..0000000 --- a/benchmark.py +++ /dev/null @@ -1,199 +0,0 @@ -""" -Benchmark comparing Polars vs Pandas performance for SBDF import and export. - -Addresses the copy-performance concerns raised in PR #99. - -Usage: - python benchmark.py -""" - -import datetime -import gc -import os -import sys -import tempfile -import time -import warnings - -import psutil -import numpy as np -import pandas as pd -import polars as pl - -import spotfire.sbdf as sbdf - -REPS = 7 -SIZES = [10_000, 100_000] - -RNG = np.random.default_rng(42) - - -# --------------------------------------------------------------------------- -# Data generators -# --------------------------------------------------------------------------- - -def make_polars(size, profile): - if profile == "numeric": - return pl.DataFrame({ - "b": pl.Series(RNG.integers(0, 2, size).astype(bool)), - "i": pl.Series(RNG.integers(0, 1_000_000, size, dtype=np.int64)), - "f": pl.Series(RNG.random(size)), - }) - if profile == "numeric_nulls": - mask = RNG.random(size) < 0.1 - ints = RNG.integers(0, 1_000_000, size, dtype=np.int64).tolist() - for idx in np.where(mask)[0]: - ints[idx] = None - floats = RNG.random(size).tolist() - for idx in np.where(mask)[0]: - floats[idx] = None - return 
pl.DataFrame({ - "i": pl.Series(ints, dtype=pl.Int64), - "f": pl.Series(floats, dtype=pl.Float64), - }) - if profile == "string": - words = ["alpha", "beta", "gamma", "delta", "epsilon"] - return pl.DataFrame({ - "s": pl.Series([words[i % len(words)] for i in range(size)]), - }) - if profile == "string_nulls": - words = ["alpha", "beta", "gamma", "delta", "epsilon"] - vals = [words[i % len(words)] if RNG.random() > 0.1 else None for i in range(size)] - return pl.DataFrame({"s": pl.Series(vals, dtype=pl.Utf8)}) - if profile == "temporal": - base = datetime.datetime(2000, 1, 1) - dts = [base + datetime.timedelta(seconds=int(x)) for x in RNG.integers(0, 86400 * 365 * 20, size)] - return pl.DataFrame({ - "dt": pl.Series(dts, dtype=pl.Datetime), - "d": pl.Series([d.date() for d in dts], dtype=pl.Date), - "td": pl.Series([datetime.timedelta(seconds=int(x)) for x in RNG.integers(0, 86400, size)], - dtype=pl.Duration), - "t": pl.Series([datetime.time(h, m, s) - for h, m, s in zip( - RNG.integers(0, 24, size), - RNG.integers(0, 60, size), - RNG.integers(0, 60, size))], - dtype=pl.Time), - }) - if profile == "temporal_nulls": - base = datetime.datetime(2000, 1, 1) - mask = RNG.random(size) < 0.1 - dts = [base + datetime.timedelta(seconds=int(x)) for x in RNG.integers(0, 86400 * 365 * 20, size)] - dts_n = [None if mask[i] else dts[i] for i in range(size)] - dates_n = [None if mask[i] else dts[i].date() for i in range(size)] - tds_n = [None if mask[i] else datetime.timedelta(seconds=int(x)) - for i, x in enumerate(RNG.integers(0, 86400, size))] - times_n = [None if mask[i] else datetime.time(int(h), int(m), int(s)) - for i, (h, m, s) in enumerate(zip(RNG.integers(0, 24, size), - RNG.integers(0, 60, size), - RNG.integers(0, 60, size)))] - return pl.DataFrame({ - "dt": pl.Series(dts_n, dtype=pl.Datetime), - "d": pl.Series(dates_n, dtype=pl.Date), - "td": pl.Series(tds_n, dtype=pl.Duration), - "t": pl.Series(times_n, dtype=pl.Time), - }) - if profile == "binary": - blobs = 
[bytes(RNG.integers(0, 256, 64, dtype=np.uint8)) for _ in range(size)] - return pl.DataFrame({"b": pl.Series(blobs, dtype=pl.Binary)}) - if profile == "binary_nulls": - blobs = [None if RNG.random() < 0.1 else bytes(RNG.integers(0, 256, 64, dtype=np.uint8)) - for _ in range(size)] - return pl.DataFrame({"b": pl.Series(blobs, dtype=pl.Binary)}) - raise ValueError(profile) - - -def make_pandas(polars_df): - return polars_df.to_pandas() - - -# --------------------------------------------------------------------------- -# Benchmark harness -# --------------------------------------------------------------------------- - -_proc = psutil.Process(os.getpid()) - -def bench(fn, reps=REPS): - """Return (mean_ms, delta_mb, total_mb). First rep is a warmup and excluded. - - Memory is measured as RSS (resident set size) so it captures Arrow/Rust/C - allocations that tracemalloc misses. delta_mb is the increase during the - call; total_mb is the absolute peak RSS of the process. - """ - times = [] - delta_mb = 0.0 - total_mb = 0.0 - for i in range(reps + 1): - gc.collect() - rss_before = _proc.memory_info().rss - t0 = time.perf_counter() - fn() - t1 = time.perf_counter() - rss_after = _proc.memory_info().rss - if i > 0: # skip warmup - times.append(t1 - t0) - delta_mb = max(delta_mb, (rss_after - rss_before) / 1024 / 1024) - total_mb = max(total_mb, rss_after / 1024 / 1024) - return (sum(times) / len(times)) * 1000, delta_mb, total_mb - - -# --------------------------------------------------------------------------- -# Run -# --------------------------------------------------------------------------- - -def run(): - profiles = [ - ("numeric", "Numeric (int/float/bool), no nulls"), - ("numeric_nulls", "Numeric (int/float), ~10% nulls"), - ("string", "String, no nulls"), - ("string_nulls", "String, ~10% nulls"), - ("temporal", "Temporal (datetime/date/duration/time), no nulls"), - ("temporal_nulls", "Temporal (datetime/date/duration/time), ~10% nulls"), - ("binary", "Binary (bytes, 
64 B each), no nulls"), - ("binary_nulls", "Binary (bytes, 64 B each), ~10% nulls"), - ] - - for size in SIZES: - print(f"\n{'='*72}") - print(f" {size:,} rows") - print(f"{'='*72}") - - for profile, label in profiles: - pol_df = make_polars(size, profile) - pan_df = make_pandas(pol_df) - - with tempfile.TemporaryDirectory() as tmp: - pol_path = f"{tmp}/pol.sbdf" - pan_path = f"{tmp}/pan.sbdf" - - # --- Export --- - sbdf.export_data(pol_df, pol_path) # pre-create for import bench - sbdf.export_data(pan_df, pan_path) - - exp_pan_ms, exp_pan_dm, exp_pan_tm = bench(lambda: sbdf.export_data(pan_df, f"{tmp}/x.sbdf")) - exp_pol_ms, exp_pol_dm, exp_pol_tm = bench(lambda: sbdf.export_data(pol_df, f"{tmp}/x.sbdf")) - exp_via_ms, exp_via_dm, exp_via_tm = bench(lambda: sbdf.export_data(pol_df.to_pandas(), f"{tmp}/x.sbdf")) - - # --- Import --- - imp_pan_ms, imp_pan_dm, imp_pan_tm = bench(lambda: sbdf.import_data(pan_path)) - imp_pol_old_ms, imp_pol_old_dm, imp_pol_old_tm = bench(lambda: pl.from_pandas(sbdf.import_data(pan_path))) - imp_pol_ms, imp_pol_dm, imp_pol_tm = bench(lambda: sbdf.import_data(pol_path, output_format=sbdf.OutputFormat.POLARS)) - - print(f"\n {label}") - print(f" {'':35s} {'time (ms)':>10} {'delta (MB)':>11} {'total RSS (MB)':>14}") - print(f" {'-'*76}") - print(f" {'Export: pandas df':35s} {exp_pan_ms:>10.1f} {exp_pan_dm:>11.1f} {exp_pan_tm:>14.1f}") - print(f" {'Export: polars df (old: via pandas)':35s} {exp_via_ms:>10.1f} {exp_via_dm:>11.1f} {exp_via_tm:>14.1f}") - print(f" {'Export: polars df (new: direct)':35s} {exp_pol_ms:>10.1f} {exp_pol_dm:>11.1f} {exp_pol_tm:>14.1f}") - print(f" {'Import: -> pandas df':35s} {imp_pan_ms:>10.1f} {imp_pan_dm:>11.1f} {imp_pan_tm:>14.1f}") - print(f" {'Import: -> polars df (old: via pandas)':35s} {imp_pol_old_ms:>10.1f} {imp_pol_old_dm:>11.1f} {imp_pol_old_tm:>14.1f}") - print(f" {'Import: -> polars df (new: direct)':35s} {imp_pol_ms:>10.1f} {imp_pol_dm:>11.1f} {imp_pol_tm:>14.1f}") - sys.stdout.flush() - - -if 
__name__ == "__main__": - import sys - warnings.filterwarnings("ignore", category=sbdf.SBDFWarning) - print(f"Python {sys.version}") - print(f"Polars {pl.__version__} Pandas {pd.__version__} NumPy {np.__version__}") - run() From 596cfd6001f4aaadeded32756b92ce1b4fd5f865 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 07:56:04 -0500 Subject: [PATCH 32/38] Perf: pass ndarray directly to scatter() instead of converting to list np.where(invalids)[0] returns an ndarray; pl.Series.scatter() accepts it directly. The .tolist() conversion was allocating an unnecessary Python list on every null-containing column import. Co-Authored-By: Claude Sonnet 4.6 --- spotfire/sbdf.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spotfire/sbdf.pyx b/spotfire/sbdf.pyx index cc18cdb..d7ae63e 100644 --- a/spotfire/sbdf.pyx +++ b/spotfire/sbdf.pyx @@ -819,7 +819,7 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): values -= _SBDF_TO_UNIX_EPOCH_MS col = pl.Series(name=name, values=values, dtype=pl.Int64).cast(pl.Datetime('ms')) if invalids.any(): - col = col.scatter(np.where(invalids)[0].tolist(), None) + col = col.scatter(np.where(invalids)[0], None) elif vt_id == sbdf_c.SBDF_DATETYPEID: # _import_vt_date_int32 already converted ms→days and wrote int32 directly. @@ -828,7 +828,7 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): context.clear_values_arrays() col = pl.Series(name=name, values=values, dtype=pl.Date) if invalids.any(): - col = col.scatter(np.where(invalids)[0].tolist(), None) + col = col.scatter(np.where(invalids)[0], None) elif vt_id == sbdf_c.SBDF_TIMESPANTYPEID: # Timespans are int64 ms with no epoch bias. 
Duration('ms') is int64 in Arrow, @@ -837,7 +837,7 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): context.clear_values_arrays() col = pl.Series(name=name, values=values, dtype=pl.Int64).cast(pl.Duration('ms')) if invalids.any(): - col = col.scatter(np.where(invalids)[0].tolist(), None) + col = col.scatter(np.where(invalids)[0], None) elif vt_id == sbdf_c.SBDF_TIMETYPEID: # _import_vt_time_int64 stores int64 ns since midnight (Polars Time internal format). @@ -852,7 +852,7 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): values[invalids] = 0 col = pl.Series(name=name, values=values, dtype=pl.Time) if invalids.any(): - col = col.scatter(np.where(invalids)[0].tolist(), None) + col = col.scatter(np.where(invalids)[0], None) elif not context.is_object_numpy_type(): # Numeric types (bool, int, float): numpy → Polars directly; Polars may zero-copy @@ -860,7 +860,7 @@ cdef object _import_build_polars_dataframe(column_names, importer_contexts): values = context.get_values_array() col = pl.Series(name=name, values=values, dtype=_import_polars_dtype(context)) if invalids.any(): - col = col.scatter(np.where(invalids)[0].tolist(), None) + col = col.scatter(np.where(invalids)[0], None) else: # String, time, binary, decimal: Polars requires a Python list (no compatible numpy From 7efcdc08646674b43acf35f2e5e7b83dce95b8ac Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 10:46:41 -0500 Subject: [PATCH 33/38] Test: add test_polars_string_multichunk to verify Arrow buffer chunk-boundary safety Exports 100_001 rows of a Polars String column, forcing a second SBDF row slice (start=100_000, count=1), and asserts the value at the chunk boundary is correct. Covers the raw C pointer arithmetic in _export_extract_string_obj_arrow which is not bounds-checked. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 97f85a5..7a484cb 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -807,3 +807,22 @@ def test_set_types_polars_error(self): polars_df = pl.DataFrame({"x": [1, 2, 3]}) with self.assertRaisesRegex(TypeError, "Polars"): spotfire.set_spotfire_types(polars_df, {"x": "Integer"}) # type: ignore[arg-type] + + def test_polars_string_multichunk(self): + """Verify Polars String exports spanning multiple SBDF row slices give correct values. + + The Arrow buffer path in _export_extract_string_obj_arrow uses raw C pointer + arithmetic (values_buf + offsets[idx]). A second chunk (start=100_000, count=1) + verifies the offset into the values buffer is computed correctly when start > 0. + """ + n = 100_001 + labels = ["a"] * n + labels[-1] = "sentinel" + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/multichunk.sbdf" + with self.assertWarns(sbdf.SBDFWarning): + sbdf.export_data(pl.DataFrame({"s": labels}), path) + result = sbdf.import_data(path) + self.assertEqual(len(result), n) + self.assertEqual(result.at[0, "s"], "a") + self.assertEqual(result.at[n - 1, "s"], "sentinel") From 53cddb51ad56fb28bb54faa22a0d942deefbdefc Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 10:53:04 -0500 Subject: [PATCH 34/38] CI: add no_polars test environment to verify package works without polars/pyarrow polars is an optional dependency; pyarrow only arrives transitively through it. Adding test_requirements_no_polars.txt causes build.yaml's test-environment matrix to automatically pick up a second CI slot that runs the full test suite with neither library installed. SbdfPolarsTest is skipped via @unittest.skipIf(pl is None, ...); all Pandas tests must pass. 
Co-Authored-By: Claude Sonnet 4.6 --- test_requirements_no_polars.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 test_requirements_no_polars.txt diff --git a/test_requirements_no_polars.txt b/test_requirements_no_polars.txt new file mode 100644 index 0000000..73ab30d --- /dev/null +++ b/test_requirements_no_polars.txt @@ -0,0 +1,6 @@ +html-testRunner +geopandas +matplotlib +pillow +seaborn +shapely \ No newline at end of file From fb654897036af44d233cc9a73b505ba45c177d44 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 19:02:16 -0500 Subject: [PATCH 35/38] Test: cross-path equivalence for all dtypes with scattered nulls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two tests to SbdfPolarsTest verifying that the Polars and Pandas import/export code paths produce identical data for all 11 non-Decimal SBDF value types with one null per column (rotating positions 0–4): - test_all_dtypes_export_polars_vs_pandas_path: exports the same data via the native Polars path and the Pandas path, imports both back as Pandas, and asserts frame equality. - test_all_dtypes_import_polars_vs_pandas_path: imports a single SBDF file as both a Polars and a Pandas DataFrame, then compares null positions and non-null values column by column. Helpers: - _all_dtypes_polars_df(): canonical Polars source with all SBDF-compatible types. - _all_dtypes_pandas_df(): equivalent Pandas source (avoids pyarrow dependency). - _assert_import_paths_equivalent(): per-column null + value comparison using Series.to_list(), which works without pyarrow. 
Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 143 +++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 7a484cb..6bc2dc2 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -826,3 +826,146 @@ def test_polars_string_multichunk(self): self.assertEqual(len(result), n) self.assertEqual(result.at[0, "s"], "a") self.assertEqual(result.at[n - 1, "s"], "sentinel") + + # Cross-path equivalence tests + + @staticmethod + def _all_dtypes_polars_df(): + """Build a canonical Polars DataFrame covering all 11 non-Decimal SBDF types. + + Each column has exactly one null at a distinct row index (rotating 0–4) so every + row contains both valid and null values. Non-null values cover negatives, pre-epoch + timestamps, edge times, and raw bytes to exercise the full value range. + """ + dt = datetime.datetime + d = datetime.date + t = datetime.time + td = datetime.timedelta + return pl.DataFrame([ + pl.Series("bool_col", [None, True, False, True, False], + dtype=pl.Boolean), + pl.Series("int32_col", [1, None, -2, 3, -4], + dtype=pl.Int32), + pl.Series("int64_col", [1, 2_000_000_000, None, -3_000_000_000, 4], + dtype=pl.Int64), + pl.Series("float32_col", [1.5, -2.5, 3.5, None, 5.5], + dtype=pl.Float32), + pl.Series("float64_col", [1.0, -2.0, 3.0, -4.0, None], + dtype=pl.Float64), + pl.Series("datetime_col", [None, + dt(2020, 1, 1, 12, 0, 0), + dt(1969, 7, 20, 20, 17, 0), + dt(2024, 12, 31, 23, 59, 59), + dt(1583, 1, 2, 0, 0, 0)], + dtype=pl.Datetime("ms")), + pl.Series("date_col", [d(2020, 1, 1), None, d(1969, 7, 20), + d(2024, 12, 31), d(1583, 1, 2)], + dtype=pl.Date), + pl.Series("time_col", [t(12, 0, 0), t(0, 0, 0), None, t(23, 59, 59), t(6, 30)], + dtype=pl.Time), + pl.Series("duration_col", [td(days=1), td(seconds=30), td(days=-1), None, td(hours=2)], + dtype=pl.Duration("ms")), + pl.Series("string_col", ["hello", "world", "foo", "bar", None], + 
dtype=pl.String), + pl.Series("binary_col", [None, b"\x00\x01", b"\xff", b"", b"\xde\xad"], + dtype=pl.Binary), + ]) + + @staticmethod + def _all_dtypes_pandas_df(): + """Build the Pandas equivalent of ``_all_dtypes_polars_df()``. + + Mirrors the same 5 rows, 11 columns, and null positions using Pandas nullable + dtypes so both DataFrames produce identical SBDF files when exported. Float columns + use numpy NaN (not pd.NA) to match what the Polars export path stores for missing + floating-point values. + + Note: ``polars.DataFrame.to_pandas()`` requires pyarrow, which is not part of the + required dependencies. This helper provides the same data without that dependency. + """ + dt = datetime.datetime + d = datetime.date + t = datetime.time + td = datetime.timedelta + return pd.DataFrame({ + "bool_col": pd.array([None, True, False, True, False], dtype="boolean"), + "int32_col": pd.array([1, None, -2, 3, -4], dtype="Int32"), + "int64_col": pd.array([1, 2_000_000_000, None, -3_000_000_000, 4], dtype="Int64"), + "float32_col": np.array([1.5, -2.5, 3.5, np.nan, 5.5], dtype="float32"), + "float64_col": np.array([1.0, -2.0, 3.0, -4.0, np.nan], dtype="float64"), + "datetime_col": pd.array([pd.NaT, + dt(2020, 1, 1, 12, 0, 0), + dt(1969, 7, 20, 20, 17, 0), + dt(2024, 12, 31, 23, 59, 59), + dt(1583, 1, 2, 0, 0, 0)], dtype="datetime64[ms]"), + "date_col": [d(2020, 1, 1), None, d(1969, 7, 20), d(2024, 12, 31), d(1583, 1, 2)], + "time_col": [t(12, 0, 0), t(0, 0, 0), None, t(23, 59, 59), t(6, 30)], + "duration_col": pd.array([td(days=1), td(seconds=30), td(days=-1), pd.NaT, td(hours=2)], + dtype="timedelta64[ms]"), + "string_col": ["hello", "world", "foo", "bar", None], + "binary_col": [None, b"\x00\x01", b"\xff", b"", b"\xde\xad"], + }) + + def test_all_dtypes_export_polars_vs_pandas_path(self): + """Exporting via the native Polars path and the Pandas path should produce identical data. 
+ + The Polars DataFrame and an equivalent Pandas DataFrame (same values, same nulls) are + each exported to a separate SBDF file. Both files are then imported back as Pandas and + compared element-wise, covering all 11 non-Decimal SBDF types with one null per column. + """ + pl_df = self._all_dtypes_polars_df() + pd_df = self._all_dtypes_pandas_df() + with tempfile.TemporaryDirectory() as tempdir: + polars_path = f"{tempdir}/via_polars.sbdf" + pandas_path = f"{tempdir}/via_pandas.sbdf" + sbdf.export_data(pl_df, polars_path) + sbdf.export_data(pd_df, pandas_path) + pd_from_polars = sbdf.import_data(polars_path) + pd_from_pandas = sbdf.import_data(pandas_path) + pdtest.assert_frame_equal( + pd_from_polars, pd_from_pandas, + check_dtype=False, check_exact=False, rtol=1e-5, + ) + + def _assert_import_paths_equivalent(self, polars_result, pandas_result): + """Assert that a Polars import result and a Pandas import result contain identical data. + + Uses ``Series.to_list()`` (no pyarrow required) to materialise Polars values as Python + objects and compares them against the corresponding Pandas column values. Null + positions are verified with ``Series.is_null()`` / ``Series.isna()``, and non-null + float values are compared with a relative tolerance to absorb float32 representation + differences. 
+ """ + self.assertEqual(list(polars_result.columns), list(pandas_result.columns)) + for col in polars_result.columns: + pl_series = polars_result[col] + pd_series = pandas_result[col] + pl_nulls = pl_series.is_null().to_list() + pd_nulls = pd_series.isna().tolist() + self.assertEqual(pl_nulls, pd_nulls, f"column '{col}': null positions differ") + pl_vals = [v for v in pl_series.to_list() if v is not None] + pd_vals = [v for v in pd_series.dropna().tolist() if v is not None] + self.assertEqual(len(pl_vals), len(pd_vals), + f"column '{col}': non-null value counts differ") + dtype_name = pl_series.dtype.__class__.__name__ + if dtype_name in ("Float32", "Float64"): + for pv, pdv in zip(pl_vals, pd_vals): + self.assertAlmostEqual(float(pv), float(pdv), places=4, + msg=f"column '{col}': value mismatch") + else: + self.assertEqual(pl_vals, pd_vals, f"column '{col}': values differ") + + def test_all_dtypes_import_polars_vs_pandas_path(self): + """Importing the same SBDF via the Polars and Pandas paths should yield equivalent data. + + The same SBDF file is imported twice — once as a native Polars DataFrame and once as a + Pandas DataFrame — then compared column by column using ``Series.to_list()`` (no + pyarrow required). Covers all 11 non-Decimal SBDF types with one null per column. 
+ """ + pl_df = self._all_dtypes_polars_df() + with tempfile.TemporaryDirectory() as tempdir: + path = f"{tempdir}/source.sbdf" + sbdf.export_data(pl_df, path) + polars_result = sbdf.import_data(path, output_format=sbdf.OutputFormat.POLARS) + pandas_result = sbdf.import_data(path) + self._assert_import_paths_equivalent(polars_result, pandas_result) From b87384849a5cbb942d229b6d2833e9840a6b2888 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 19:12:26 -0500 Subject: [PATCH 36/38] Fix: shorten over-long test method names and rename 2-char variable for pylint Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 6bc2dc2..94872cc 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -906,7 +906,7 @@ def _all_dtypes_pandas_df(): "binary_col": [None, b"\x00\x01", b"\xff", b"", b"\xde\xad"], }) - def test_all_dtypes_export_polars_vs_pandas_path(self): + def test_all_dtypes_polars_export(self): """Exporting via the native Polars path and the Pandas path should produce identical data. The Polars DataFrame and an equivalent Pandas DataFrame (same values, same nulls) are @@ -949,13 +949,13 @@ def _assert_import_paths_equivalent(self, polars_result, pandas_result): f"column '{col}': non-null value counts differ") dtype_name = pl_series.dtype.__class__.__name__ if dtype_name in ("Float32", "Float64"): - for pv, pdv in zip(pl_vals, pd_vals): - self.assertAlmostEqual(float(pv), float(pdv), places=4, + for pl_val, pdv in zip(pl_vals, pd_vals): + self.assertAlmostEqual(float(pl_val), float(pdv), places=4, msg=f"column '{col}': value mismatch") else: self.assertEqual(pl_vals, pd_vals, f"column '{col}': values differ") - def test_all_dtypes_import_polars_vs_pandas_path(self): + def test_all_dtypes_polars_import(self): """Importing the same SBDF via the Polars and Pandas paths should yield equivalent data. 
The same SBDF file is imported twice — once as a native Polars DataFrame and once as a From c91fd1a95ec63407a591992515bdcc6a112384bb Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 19:24:31 -0500 Subject: [PATCH 37/38] Fix: add type: ignore[call-overload] for pd.array timedelta64 mypy overload Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 94872cc..4faf5b5 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -900,7 +900,7 @@ def _all_dtypes_pandas_df(): dt(1583, 1, 2, 0, 0, 0)], dtype="datetime64[ms]"), "date_col": [d(2020, 1, 1), None, d(1969, 7, 20), d(2024, 12, 31), d(1583, 1, 2)], "time_col": [t(12, 0, 0), t(0, 0, 0), None, t(23, 59, 59), t(6, 30)], - "duration_col": pd.array([td(days=1), td(seconds=30), td(days=-1), pd.NaT, td(hours=2)], + "duration_col": pd.array([td(days=1), td(seconds=30), td(days=-1), pd.NaT, td(hours=2)], # type: ignore[call-overload] dtype="timedelta64[ms]"), "string_col": ["hello", "world", "foo", "bar", None], "binary_col": [None, b"\x00\x01", b"\xff", b"", b"\xde\xad"], From fb3760b3168129ab658b62d330bd91b23099da02 Mon Sep 17 00:00:00 2001 From: stewjb Date: Sat, 4 Apr 2026 19:48:09 -0500 Subject: [PATCH 38/38] Fix: move type: ignore comment to continuation line to fix line-too-long (131/120) Co-Authored-By: Claude Sonnet 4.6 --- spotfire/test/test_sbdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spotfire/test/test_sbdf.py b/spotfire/test/test_sbdf.py index 4faf5b5..2d220ec 100644 --- a/spotfire/test/test_sbdf.py +++ b/spotfire/test/test_sbdf.py @@ -900,8 +900,8 @@ def _all_dtypes_pandas_df(): dt(1583, 1, 2, 0, 0, 0)], dtype="datetime64[ms]"), "date_col": [d(2020, 1, 1), None, d(1969, 7, 20), d(2024, 12, 31), d(1583, 1, 2)], "time_col": [t(12, 0, 0), t(0, 0, 0), None, t(23, 59, 59), t(6, 30)], - "duration_col": 
pd.array([td(days=1), td(seconds=30), td(days=-1), pd.NaT, td(hours=2)], # type: ignore[call-overload] - dtype="timedelta64[ms]"), + "duration_col": pd.array([td(days=1), td(seconds=30), td(days=-1), pd.NaT, td(hours=2)], + dtype="timedelta64[ms]"), # type: ignore[call-overload] "string_col": ["hello", "world", "foo", "bar", None], "binary_col": [None, b"\x00\x01", b"\xff", b"", b"\xde\xad"], })