Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 73 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
# NetCDF Metadata
# Dataset Profile

## Overview

A utility to describe the structure of NetCDF4 datasets.

Reads a NetCDF4 file and reports the group structure and information
about any dimensions, variables, and attributes that are defined.
A utility to describe the structure of datasets in netCDF, GeoTIFF,
and ESRI Shapefile formats.

## Installation

Expand All @@ -19,27 +17,45 @@ The optional test suite may be installed and run with:

```bash
$ python -m pip install .[test]
$ pytest --cov=ncmetadata tests
$ pytest --cov=dsprofile tests
```

## Usage

```bash
usage: ncmetadata [-h] [-o {category,group}] [-e <group0>,<group1>,...] [-m] [-d] filename
usage: dsprofile [-h] {netcdf,geotiff,shape} ...

Describes datasets in a variety of formats

options:
-h, --help show this help message and exit

Dataset formats:
{netcdf,geotiff,shape}
netcdf Extracts metadata from netCDF4 files
geotiff Extracts metadata from GeoTIFF files
shape Extracts metadata from ESRI Shape files
```

## NetCDF Options

Reads a netCDF4 file and reports the group structure and information
about any dimensions, variables, and attributes that are defined.

Extracts metadata from netCDF4 files
```bash
usage: dsprofile netcdf [-h] [-o {category,group}] [-e <group0>,<group1>,...] [-m] [-d] filename

positional arguments:
filename

options:
-h, --help show this help message and exit
-o {category,group} --order-by {category,group}
(default group)
-h, --help show this help message and exit
-o {category,group}, --order-by {category,group}
(default group)
-e <group0>,<group1>,..., --exclude-groups <group0>,<group1>,...
Exclude each of the named <group> arguments
-m, --omit-metadata Output only netCDF file contents, not file metadata
-d, --omit-digest Do not include a hash digest in file metadata
Exclude each of the named <group> arguments
-m, --omit-metadata Output only netCDF file contents, not file metadata
-d, --omit-digest Do not include a hash digest in file metadata
```

The `--order-by` option allows the resulting output to be arranged in one of two ways:
Expand All @@ -53,11 +69,52 @@ The `--omit-digest` option prevents calculation of a SHA256 hash for the process
This may be desirable for very large files or test workflows to avoid the potentially
time-consuming hashing operation.

## Example
### NetCDF Example

For example, to report on the contents of the netCDF4 file `test.nc` using the default
output options...

```bash
$ ncmetadata test.nc
$ dsprofile netcdf test.nc
```

## GeoTIFF Options

```bash
usage: dsprofile geotiff [-h] [-m] [-d] filename

positional arguments:
filename

options:
-h, --help show this help message and exit
-m, --omit-metadata Output only GeoTIFF file contents, not file metadata
-d, --omit-digest Do not include a hash digest in file metadata
```

## ESRI Shapefile Options

```bash
usage: dsprofile shape [-h] [-m] [-d] filename

positional arguments:
filename

options:
-h, --help show this help message and exit
-m, --omit-metadata Output only Shape file contents, not file metadata
-d, --omit-digest Do not include a hash digest in file metadata
```

A Shapefile may be read by opening any of its components, for example...

```bash
$ dsprofile shape shapefile.shp
```
...is equivalent to...

```bash
$ dsprofile shape shapefile.dbf
```
Note, however, that where a hex digest of a hash is included in the output,
it refers only to the file provided as a command-line argument.
File renamed without changes.
9 changes: 9 additions & 0 deletions dsprofile/lib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .reader import (
Reader,
reader_type_map,
make_reader
)

from .netcdf import NetCDFReader
from .tiff import GeoTIFFReader
from .shape import ShapefileReader
194 changes: 194 additions & 0 deletions dsprofile/lib/netcdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
#from dsprofile.util import (
# read_dataset
#)
import pathlib
import sys
import weakref

from collections.abc import Sequence

from dsprofile.lib import Reader

import netCDF4 as nc


# Module-level list of group paths that walk_groups_ordered skips.
# NetCDFReader.__init__ rebinds this name, so it always reflects the
# most recently constructed reader.
exclude_groups = []


def walk_groups_breadth_first(ds):
    """
    Generate the child-group collections found beneath <ds>.

    The collection of immediate child groups of <ds> is yielded first,
    followed by everything generated for each child in turn. Note that
    each item yielded is a ``groups.values()`` view, not a single group.
    """
    children = ds.groups.values()
    yield children
    for child in children:
        yield from walk_groups_breadth_first(child)


def walk_groups_depth_first(ds):
    """
    Generate the child-group collections found beneath <ds>, deepest first.

    Everything generated for each child group is yielded before the
    collection of immediate child groups of <ds> itself. Note that each
    item yielded is a ``groups.values()`` view, not a single group.
    """
    children = ds.groups.values()
    for child in children:
        yield from walk_groups_depth_first(child)
    yield children


def walk_groups_ordered(ds):
    """
    Generate <ds> and each of its descendant groups.

    Descendants are yielded before the group which contains them, so
    <ds> itself is always the final item. Any child whose path appears
    in the module-level exclude_groups list is skipped, along with
    everything beneath it.
    """
    for child in ds.groups.values():
        if child.path not in exclude_groups:
            yield from walk_groups_ordered(child)
    yield ds


# Maps a traversal order name to the generator function implementing it;
# walk_groups uses this table to select a walker.
walk_func_map = {
    "breadth": walk_groups_breadth_first,
    "depth": walk_groups_depth_first,
    "ordered": walk_groups_ordered
}


def walk_groups(ds, order="ordered"):
    """
    Walk the groups of <ds> using the traversal named by <order>.

    <order> must be a key of walk_func_map; an unknown name raises
    KeyError. Returns whatever the selected walker returns for <ds>.
    """
    walker = walk_func_map[order]
    return walker(ds)


class NetCDFReader(Reader):
    """
    A Reader which describes the structure of a netCDF4 file.

    Reports the dimensions, variables, and attributes defined in each
    group of the dataset, arranged either by group or by category.
    """

    # Key under which this class is registered as a Reader subclass
    format = "netcdf"

    def __init__(self, filename, order_by="group", exclude=None):
        """
        Open <filename> and record how its contents are to be reported.

        :param filename: path of the netCDF4 file to read
        :param order_by: a key of process_func_map ("group" or "category")
        :param exclude: a single group path, or an iterable of group
                        paths, to be omitted when walking the dataset
        """
        self.ds = self.__class__.read_dataset(filename)
        # Close the dataset when this reader is garbage collected
        self._finalizer = weakref.finalize(self, self.finalize_close, self.ds)
        self.order_by = order_by
        # Note that the order is significant here
        # a str is a Sequence type
        if not exclude:
            self.exclude_groups = []
        elif isinstance(exclude, str):
            self.exclude_groups = [exclude]
        elif isinstance(exclude, Sequence):
            self.exclude_groups = exclude
        else:
            # Any other iterable (e.g. a set) previously left the attribute
            # unset, causing an AttributeError on the assignment below
            self.exclude_groups = list(exclude)
        # TODO: Questionable...
        # walk_groups_ordered reads the module-level name, so constructing
        # a reader changes the exclusions applied to every reader.
        global exclude_groups
        exclude_groups = self.exclude_groups

    @staticmethod
    def read_dataset(filename):
        """
        Handle OSError, PermissionError, FileNotFoundError neatly
        Inform neatly for non-netCDF4 files
        Allow all other exceptions to raise unhandled

        :param filename: path of the file to open read-only
        :return: the open netCDF4 Dataset
        Exits the process with status 1 if the file cannot be opened or
        its data model is not 'NETCDF4'.
        """

        try:
            ds = nc.Dataset(filename, 'r')
        except OSError as e:
            # PermissionError and FileNotFoundError are OSError subclasses.
            # strerror is None when the exception carries only a message,
            # so fall back to the exception text in that case.
            reason = e.strerror or str(e)
            print(f"{reason} for file '{filename}'", file=sys.stderr)
            sys.exit(1)

        if ds.data_model != "NETCDF4":
            print(f"File '{filename}' has format '{ds.data_model}', "
                  f"not 'NETCDF4' as required", file=sys.stderr)
            ds.close()
            sys.exit(1)

        return ds

    @staticmethod
    def finalize_close(ncdf):
        """
        Close <ncdf> if it is a netCDF4 Dataset which is still open.
        """
        if isinstance(ncdf, nc.Dataset) and ncdf.isopen():
            ncdf.close()

    def close(self):
        """
        Close the underlying dataset; further calls have no effect.
        """
        if self._finalizer.alive:
            self._finalizer()

    def gather_by_group(self):
        """
        A categorisation of dimensions, variables, and
        attributes defined in the <ds> Dataset argument,
        ordered by the group to which they belong.
        """
        dims = self.describe_dimensions()
        ncvars = self.describe_variables()
        attrs = self.describe_attributes()
        by_group = {}
        for group in walk_groups(self.ds):
            by_group[group.path] = {
                "dimensions": dims[group.path],
                "variables": ncvars[group.path],
                "attributes": attrs[group.path]
            }

        return by_group

    def gather_by_type(self):
        """
        A categorisation of dimensions, variables, and
        attributes defined in the <ds> Dataset argument,
        ordered by type.
        """
        return {
            "dimensions": self.describe_dimensions(),
            "variables": self.describe_variables(),
            "attributes": self.describe_attributes()
        }

    # Maps each accepted order_by value to the method producing that layout.
    # The entries are plain functions, so process() passes self explicitly.
    process_func_map = {
        "category": gather_by_type,
        "group": gather_by_group
    }

    def describe_dimensions(self):
        """
        Map each group path to its dimensions, each given as
        {name: {"size": size}}.
        """
        dimensions = {}

        for group in walk_groups(self.ds):
            dimensions[group.path] = {d.name: {"size": d.size}
                                      for d in group.dimensions.values()}

        return dimensions

    def describe_variables(self):
        """
        Map each group path to its variables, each given as
        {name: {"dtype", "dimensions", "fill_value"}}.
        """
        variables = {}
        for group in walk_groups(self.ds):
            variables[group.path] = {v.name: {"dtype": v.dtype.name,
                                              "dimensions": v.dimensions,
                                              "fill_value": str(v.get_fill_value())}
                                     for v in group.variables.values()}
        return variables

    def describe_attributes(self):
        """
        Map each group path to the names of its own attributes ("group")
        and the attribute names of each of its variables ("vars").
        """
        attrs = {}
        for group in walk_groups(self.ds):
            attrs[group.path] = {
                "group": list(group.ncattrs()),
                "vars": {v.name: list(v.ncattrs())
                         for v in group.variables.values()}
            }
        return attrs

    def process(self):
        """
        Describe the dataset using the layout selected by order_by.

        Raises KeyError if order_by is not a key of process_func_map.
        """
        return self.process_func_map[self.order_by](self)

    @classmethod
    def build_subparser(cls, sp):
        """
        Add the "netcdf" sub-command and its options to <sp>.

        :param sp: an object providing add_parser, presumably the result
                   of ArgumentParser.add_subparsers() - confirm with caller
        :return: the parser created for this sub-command
        """
        parser = sp.add_parser(cls.format,
                               help="Extracts metadata from netCDF4 files")
        parser.add_argument("filename", type=pathlib.Path)
        parser.add_argument("-o", "--order-by", choices=["category", "group"],
                            default="group", help="(default group)")
        parser.add_argument("-e", "--exclude-groups", metavar="<group0>,<group1>,...",
                            help="Exclude each of the named <group> arguments")
        parser.add_argument("-m", "--omit-metadata", action="store_true",
                            help="Output only netCDF file contents, not file metadata")
        parser.add_argument("-d", "--omit-digest", action="store_true",
                            help="Do not include a hash digest in file metadata")
        return parser

    @classmethod
    def handle_args(cls, args):
        """
        Convert parsed command-line <args> into constructor arguments.

        Exits the process with status 1 if the filename is a directory.

        :return: a (ctor_args, ctor_kwargs) pair for this class
        """
        if args.filename.is_dir():
            print(f"A valid file is required not directory '{args.filename}'",
                  file=sys.stderr)
            sys.exit(1)

        # The option may be absent or None; either means nothing is excluded
        requested = getattr(args, "exclude_groups", None)
        exclude = requested.split(',') if requested else []

        ctor_args = [args.filename, args.order_by, exclude]
        ctor_kwargs = {}

        return ctor_args, ctor_kwargs
40 changes: 40 additions & 0 deletions dsprofile/lib/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from abc import (
ABC,
abstractmethod
)


# Registry mapping each Reader subclass's format key to the subclass itself;
# populated by Reader.__init_subclass__ and consulted by make_reader.
reader_type_map = {}


class Reader(ABC):
    """
    An abstract base for all Reader types.

    Each subclass must define a non-empty str class attribute whose name
    is given by subclass_type_key. Defining the subclass registers it in
    reader_type_map under that attribute's value.
    """

    # Name of the class attribute which every subclass must define
    subclass_type_key = "format"

    def __init_subclass__(cls, /, **kwargs):
        """
        Register the new subclass <cls> in reader_type_map.

        :raises NotImplementedError: if <cls> does not define the key
            attribute as a non-empty str
        """
        super().__init_subclass__(**kwargs)
        keyattr = __class__.subclass_type_key
        reader_type = getattr(cls, keyattr, None)
        if not reader_type or not isinstance(reader_type, str):
            # Name the attribute which is missing, not its (invalid) value
            raise NotImplementedError(f"Reader subclass {cls.__qualname__} "
                                      f"does not define a '{keyattr}' key")
        reader_type_map[reader_type] = cls

    @abstractmethod
    def process(self):
        """
        Produce the reader's output for its dataset.
        """
        pass

    @classmethod
    @abstractmethod
    def handle_args(cls, args):
        """
        Convert parsed command-line <args> into a (ctor_args, ctor_kwargs)
        pair with which the subclass can be constructed.
        """
        pass


def make_reader(args):
    """
    Construct the Reader registered for the sub-command in <args>.

    The class is looked up in reader_type_map by args.command (KeyError
    if unregistered) and built from the arguments its handle_args returns.
    """
    reader_cls = reader_type_map[args.command]
    positional, keyword = reader_cls.handle_args(args)
    return reader_cls(*positional, **keyword)
Loading