eScienceLab · pmslavin · Feb 23, 2026 · Feb 24, 2026 · Feb 27, 2026
diff --git a/README.md b/README.md
@@ -1,11 +1,9 @@
-# NetCDF Metadata
+# Dataset Profile
 
 ## Overview
 
-A utility to describe the structure of NetCDF4 datasets.
-
-Reads a NetCDF4 file and reports the group structure and information
-about any dimensions, variables, and attributes that are defined.
+A utility to describe the structure of datasets in netCDF, GeoTIFF,
+and ESRI Shapefile formats.
 
 ## Installation
 
@@ -19,27 +17,45 @@ The optional test suite may be installed and run with:
 
 ```bash
 $ python -m pip install .[test]
-$ pytest --cov=ncmetadata tests
+$ pytest --cov=dsprofile tests
 ```
 
 ## Usage
 
 ```bash
-usage: ncmetadata [-h] [-o {category,group}] [-e <group0>,<group1>,...] [-m] [-d] filename
+usage: dsprofile [-h] {netcdf,geotiff,shape} ...
+
+Describes datasets in a variety of formats
+
+options:
+  -h, --help           show this help message and exit
+
+Dataset formats:
+  {netcdf,geotiff,shape}
+    netcdf             Extracts metadata from netCDF4 files
+    geotiff            Extracts metadata from GeoTIFF files
+    shape              Extracts metadata from ESRI Shape files
+```
+
+## NetCDF Options
+
+Reads a netCDF4 file and reports the group structure and information
+about any dimensions, variables, and attributes that are defined.
 
-Extracts metadata from netCDF4 files
+```bash
+usage: dsprofile netcdf [-h] [-o {category,group}] [-e <group0>,<group1>,...] [-m] [-d] filename
 
 positional arguments:
   filename
 
 options:
-  -h, --help                show this help message and exit
-  -o {category,group}       --order-by {category,group}
-                            (default group)
+  -h, --help            show this help message and exit
+  -o {category,group}, --order-by {category,group}
+                        (default group)
   -e <group0>,<group1>,..., --exclude-groups <group0>,<group1>,...
-                            Exclude each of the named <group> arguments
-  -m, --omit-metadata       Output only netCDF file contents, not file metadata
-  -d, --omit-digest         Do not include a hash digest in file metadata
+                        Exclude each of the named <group> arguments
+  -m, --omit-metadata   Output only netCDF file contents, not file metadata
+  -d, --omit-digest     Do not include a hash digest in file metadata
 ```
 
 The `--order-by` option allows the resulting output to be arranged in one of two ways:
@@ -53,11 +69,52 @@ The `--omit-digest` option prevents calculation of a SHA256 hash for the process
 This may be desirable for very large files or test workflows to avoid the potentially
 time-consuming hashing operation.
 
-## Example
+### NetCDF Example
 
 For example, to report on the contents of the netCDF4 file `test.nc` using the default
 output options...
 
 ```bash
-$ ncmetadata test.nc
+$ dsprofile netcdf test.nc
+```
+
+## GeoTIFF Options
+
+```bash
+usage: dsprofile geotiff [-h] [-m] [-d] filename
+
+positional arguments:
+  filename
+
+options:
+  -h, --help           show this help message and exit
+  -m, --omit-metadata  Output only GeoTIFF file contents, not file metadata
+  -d, --omit-digest    Do not include a hash digest in file metadata
+```
+
+## ESRI Shapefile Options
+
+```bash
+usage: dsprofile shape [-h] [-m] [-d] filename
+
+positional arguments:
+  filename
+
+options:
+  -h, --help           show this help message and exit
+  -m, --omit-metadata  Output only Shape file contents, not file metadata
+  -d, --omit-digest    Do not include a hash digest in file metadata
+```
+
+A Shapefile may be read by opening any of its components, for example...
+
+```bash
+$ dsprofile shape shapefile.shp
+```
+...is equivalent to...
+
+```bash
+$ dsprofile shape shapefile.dbf
 ```
+Note, however, that where a hex digest of a hash is included in the output,
+it refers only to the file provided as the command-line argument.
diff --git a/ncmetadata/__init__.py → dsprofile/__init__.py b/ncmetadata/__init__.py → dsprofile/__init__.py
diff --git a/dsprofile/lib/__init__.py b/dsprofile/lib/__init__.py
@@ -0,0 +1,9 @@
+from .reader import (
+    Reader,
+    reader_type_map,
+    make_reader
+)
+
+from .netcdf import NetCDFReader
+from .tiff import GeoTIFFReader
+from .shape import ShapefileReader
diff --git a/dsprofile/lib/netcdf.py b/dsprofile/lib/netcdf.py
@@ -0,0 +1,194 @@
+#from dsprofile.util import (
+#    read_dataset
+#)
+import pathlib
+import sys
+import weakref
+
+from collections.abc import Sequence
+
+from dsprofile.lib import Reader
+
+import netCDF4 as nc
+
+
+# Group paths skipped by walk_groups_ordered. NetCDFReader.__init__
+# rebinds this name each time a reader is constructed.
+exclude_groups = []
+
+
+def walk_groups_breadth_first(ds):
+    """
+      Yields the collection of child groups of <ds>, then repeats
+      this for each child in turn.
+
+      Note that each item yielded is the values view of a `groups`
+      mapping rather than an individual group, and that a parent's
+      children are yielded before those of any of its descendants.
+    """
+    children = ds.groups.values()
+    yield children
+    for child in children:
+        yield from walk_groups_breadth_first(child)
+
+
+def walk_groups_depth_first(ds):
+    """
+      Recurses into every child group of <ds> before yielding the
+      collection of child groups of <ds> itself.
+
+      Note that each item yielded is the values view of a `groups`
+      mapping rather than an individual group.
+    """
+    children = ds.groups.values()
+    for child in children:
+        yield from walk_groups_depth_first(child)
+    yield children
+
+
+def walk_groups_ordered(ds):
+    """
+      Yields every group at or beneath <ds>, each group following
+      all of its descendants, so that <ds> itself is yielded last.
+
+      A child whose path appears in the module-level exclude_groups
+      is skipped along with everything beneath it.
+    """
+    for child in ds.groups.values():
+        if child.path not in exclude_groups:
+            yield from walk_groups_ordered(child)
+    yield ds
+
+
+# Traversal functions selectable by name through walk_groups
+walk_func_map = dict(
+    breadth=walk_groups_breadth_first,
+    depth=walk_groups_depth_first,
+    ordered=walk_groups_ordered,
+)
+
+
+def walk_groups(ds, order="ordered"):
+    """
+      Returns the traversal of <ds> produced by the walk function
+      registered in walk_func_map under <order>.
+
+      Raises KeyError where <order> is not a key of walk_func_map.
+    """
+    walker = walk_func_map[order]
+    return walker(ds)
+
+
+class NetCDFReader(Reader):
+    """
+      A Reader which describes the groups, dimensions, variables,
+      and attributes of a netCDF4 file.
+    """
+
+    format = "netcdf"
+
+    def __init__(self, filename, order_by="group", exclude=None):
+        """
+          Opens <filename> for reading and records the output options.
+
+          filename: path of the netCDF4 file to describe
+          order_by: key of process_func_map selecting the output layout
+          exclude:  a single group path, or an iterable of group paths,
+                    to omit from the output; None or empty omits nothing
+        """
+        self.ds = self.__class__.read_dataset(filename)
+        self._finalizer = weakref.finalize(self, self.finalize_close, self.ds)
+        self.order_by = order_by
+        # Note that the order is significant here
+        # a str is a Sequence type
+        if not exclude:
+            self.exclude_groups = []
+        elif isinstance(exclude, str):
+            self.exclude_groups = [exclude]
+        elif isinstance(exclude, Sequence):
+            self.exclude_groups = exclude
+        else:
+            # Sets, generators, and other non-Sequence iterables would
+            # otherwise leave exclude_groups unset and fail below
+            self.exclude_groups = list(exclude)
+        # The module-level name is kept in step for any caller of
+        # walk_groups_ordered. This class walks with its own exclusions
+        # (see _walk_groups) so that constructing a second reader does
+        # not alter the output of the first.
+        global exclude_groups
+        exclude_groups = self.exclude_groups
+
+    @staticmethod
+    def read_dataset(filename):
+        """
+          Opens <filename> read-only and returns the Dataset.
+
+          Handle OSError, PermissionError, FileNotFoundError neatly
+          Inform neatly for non-netCDF4 files
+          Allow all other exceptions to raise unhandled
+
+          In both handled cases a message is written to stderr and
+          the process exits with status 1.
+        """
+
+        try:
+            ds = nc.Dataset(filename, 'r')
+        except OSError as e:
+            # PermissionError and FileNotFoundError are subclasses of OSError.
+            # strerror is None where the error carries only a message, so
+            # fall back to the text of the exception itself.
+            print(f"{e.strerror or e} for file '{filename}'", file=sys.stderr)
+            sys.exit(1)
+
+        if ds.data_model != "NETCDF4":
+            print(f"File '{filename}' has format '{ds.data_model}', "
+                  f"not 'NETCDF4' as required", file=sys.stderr)
+            ds.close()
+            sys.exit(1)
+
+        return ds
+
+    @staticmethod
+    def finalize_close(ncdf):
+        """
+          Closes <ncdf> if it is a Dataset which is still open.
+          Registered as the finalizer callback for each instance.
+        """
+        if isinstance(ncdf, nc.Dataset) and ncdf.isopen():
+            ncdf.close()
+
+    def close(self):
+        """
+          Closes the underlying Dataset now, unless the finalizer
+          has already run.
+        """
+        if self._finalizer.alive:
+            self._finalizer()
+
+    def _walk_groups(self, group=None):
+        """
+          Yields every group at or beneath <group> (by default the
+          root Dataset), each group following all of its descendants.
+
+          A child whose path appears in self.exclude_groups is skipped
+          along with everything beneath it.
+        """
+        if group is None:
+            group = self.ds
+        for child in group.groups.values():
+            if child.path in self.exclude_groups:
+                continue
+            yield from self._walk_groups(child)
+        yield group
+
+    def gather_by_group(self):
+        """
+          A categorisation of dimensions, variables, and
+          attributes defined in the Dataset of this reader,
+          ordered by the group to which they belong.
+        """
+        dims = self.describe_dimensions()
+        ncvars = self.describe_variables()
+        attrs = self.describe_attributes()
+        by_group = {}
+        for group in self._walk_groups():
+            by_group[group.path] = {
+                "dimensions": dims[group.path],
+                "variables": ncvars[group.path],
+                "attributes": attrs[group.path]
+            }
+
+        return by_group
+
+    def gather_by_type(self):
+        """
+          A categorisation of dimensions, variables, and
+          attributes defined in the Dataset of this reader,
+          ordered by type.
+        """
+        return {
+            "dimensions": self.describe_dimensions(),
+            "variables": self.describe_variables(),
+            "attributes": self.describe_attributes()
+        }
+
+    # Maps each accepted <order_by> value to the function which builds
+    # the corresponding output. These are plain functions at this point
+    # in the class body, so process() passes the instance explicitly.
+    process_func_map = {
+        "category": gather_by_type,
+        "group": gather_by_group
+    }
+
+    def describe_dimensions(self):
+        """
+          Returns a dict keyed by group path, each value mapping the
+          name of a dimension in that group to a dict of its size.
+        """
+        dimensions = {}
+
+        for group in self._walk_groups():
+            dimensions[group.path] = {d.name: {"size": d.size} for d in group.dimensions.values()}
+
+        return dimensions
+
+    def describe_variables(self):
+        """
+          Returns a dict keyed by group path, each value mapping the
+          name of a variable in that group to its dtype name, its
+          dimensions, and its fill value rendered as a str.
+        """
+        variables = {}
+        for group in self._walk_groups():
+            variables[group.path] = {v.name: {"dtype": v.dtype.name,
+                                              "dimensions": v.dimensions,
+                                              "fill_value": str(v.get_fill_value())}
+                                     for v in group.variables.values()}
+        return variables
+
+    def describe_attributes(self):
+        """
+          Returns a dict keyed by group path, each value holding the
+          attribute names of the group itself under "group" and the
+          attribute names of each of its variables under "vars".
+        """
+        attrs = {}
+        for group in self._walk_groups():
+            attrs[group.path] = {"group": list(group.ncattrs()),
+                                 "vars": {v.name: list(v.ncattrs()) for v in group.variables.values()}
+                                }
+        return attrs
+
+    def process(self):
+        """
+          Returns the output of the function registered in
+          process_func_map under self.order_by.
+        """
+        return self.process_func_map[self.order_by](self)
+
+    @classmethod
+    def build_subparser(cls, sp):
+        """
+          Adds a parser named after <cls.format> to the subparsers
+          object <sp>, defines its arguments, and returns it.
+        """
+        parser = sp.add_parser(cls.format,
+                               help="Extracts metadata from netCDF4 files")
+        parser.add_argument("filename", type=pathlib.Path)
+        parser.add_argument("-o", "--order-by", choices=["category", "group"],
+                            default="group", help="(default group)")
+        parser.add_argument("-e", "--exclude-groups", metavar="<group0>,<group1>,...",
+                            help="Exclude each of the named <group> arguments")
+        parser.add_argument("-m", "--omit-metadata", action="store_true",
+                            help="Output only netCDF file contents, not file metadata")
+        parser.add_argument("-d", "--omit-digest", action="store_true",
+                            help="Do not include a hash digest in file metadata")
+        return parser
+
+    @classmethod
+    def handle_args(cls, args):
+        """
+          Translates parsed command-line <args> into constructor
+          arguments, returned as a (ctor_args, ctor_kwargs) pair.
+
+          Writes a message to stderr and exits with status 1 where
+          <args.filename> is a directory.
+        """
+        if args.filename.is_dir():
+            print(f"A valid file is required not directory '{args.filename}'",
+                  file=sys.stderr)
+            sys.exit(1)
+
+        # The option holds a single comma-separated str, or is None or
+        # absent where no groups are to be excluded
+        requested = getattr(args, "exclude_groups", None)
+        exclude = requested.split(',') if requested else []
+
+        ctor_args = [args.filename, args.order_by, exclude]
+        ctor_kwargs = {}
+
+        return ctor_args, ctor_kwargs
diff --git a/dsprofile/lib/reader.py b/dsprofile/lib/reader.py
@@ -0,0 +1,40 @@
+from abc import (
+    ABC,
+    abstractmethod
+)
+
+
+# Maps the format key of each Reader subclass to the subclass itself.
+# Filled by Reader.__init_subclass__ and consulted by make_reader.
+reader_type_map = {}
+
+
+class Reader(ABC):
+    """
+      An abstract base for all Reader types.
+
+      Each subclass must define a non-empty str class attribute with
+      the name held in <subclass_type_key>. The subclass is recorded
+      in reader_type_map under that value when the subclass is created.
+    """
+
+    subclass_type_key = "format"
+
+    def __init_subclass__(cls, /, **kwargs):
+        """
+          Registers <cls> in reader_type_map under its type key.
+
+          Raises NotImplementedError where <cls> does not provide a
+          non-empty str for the attribute named by subclass_type_key.
+        """
+        super().__init_subclass__(**kwargs)
+        keyattr = __class__.subclass_type_key
+        reader_type = getattr(cls, keyattr, None)
+        if not reader_type or not isinstance(reader_type, str):
+            # Report the name of the required attribute; the value found
+            # is None or otherwise unusable whenever this branch is taken
+            raise NotImplementedError(f"Reader subclass {cls.__qualname__} "
+                                      f"does not define a '{keyattr}' key")
+        reader_type_map[reader_type] = cls
+
+    @abstractmethod
+    def process(self):
+        """
+          Implemented by each subclass to produce its output.
+        """
+        pass
+
+    @classmethod
+    @abstractmethod
+    def handle_args(cls, args):
+        """
+          Implemented by each subclass to translate parsed command-line
+          <args> into a (ctor_args, ctor_kwargs) pair, which make_reader
+          expands into a call of the subclass constructor.
+        """
+        pass
+
+
+def make_reader(args):
+    """
+      Builds the Reader registered in reader_type_map for the
+      subcommand <args.command>, constructing it from the arguments
+      which that class derives from <args> through handle_args.
+
+      Raises KeyError where no Reader is registered for the command.
+    """
+    reader_cls = reader_type_map[args.command]
+    positional, keywords = reader_cls.handle_args(args)
+    return reader_cls(*positional, **keywords)