diff --git a/docs/conf.py b/docs/conf.py index 5a214e555..a66c4aaa7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,7 +43,7 @@ '_build', 'Thumbs.db', '.DS_Store', - 'tutorials/dataset_basic_tutorial.md', + '**/*_tutorial.md', # ipynb files will be used instead. ] # Suppress warning in exception basic_data_tutorial @@ -116,6 +116,8 @@ 'tutorials/data_sources/bagz_data_source_tutorial.ipynb', 'tutorials/data_sources/huggingface_dataset_tutorial.ipynb', 'tutorials/data_sources/pytorch_dataset_tutorial.ipynb', + 'tutorials/performance_debugging.ipynb', + 'dataset/performance_debugging.ipynb', ] diff --git a/docs/grain.data_loader.rst b/docs/grain.data_loader.rst index 9f9004136..13ed7efe2 100644 --- a/docs/grain.data_loader.rst +++ b/docs/grain.data_loader.rst @@ -1,5 +1,5 @@ ``grain`` DataLoader -================= +==================== .. automodule:: grain._src.python.data_loader .. currentmodule:: grain diff --git a/docs/index.md b/docs/index.md index b13a0826a..c442fdb7b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -44,6 +44,7 @@ not depend on TensorFlow. :maxdepth: 1 :hidden: :caption: Get started +overview installation api_choice ``` diff --git a/docs/tutorials/data_loader_tutorial.md b/docs/tutorials/data_loader_tutorial.md index a2d3a9d67..642731695 100644 --- a/docs/tutorials/data_loader_tutorial.md +++ b/docs/tutorials/data_loader_tutorial.md @@ -11,6 +11,8 @@ kernelspec: name: python3 --- + + +++ {"id": "qGiXX-sg4l9o"} # `DataLoader` guide @@ -96,7 +98,7 @@ index_sampler = grain.IndexSampler( ## Data source A data source is responsible for reading indvidual records from underlying files / storage system. We provide the following data sources: -* `ArrayRecordDataSource`: reads records from [ArrayRecord](go/array-record-design) files. +* `ArrayRecordDataSource`: reads records from [ArrayRecord](https://github.com/google/array_record) files. * `tfds.data_source`: data source for [TFDS](https://www.tensorflow.org/datasets) datasets without a TensorFlow dependency. @@ -106,7 +108,7 @@ Below, we show an example using a TFDS data source, but using other data sources ## TFDS Data source -```{code-cell} +``` {code-cell} --- executionInfo: elapsed: 38785 diff --git a/grain/_src/python/dataset/dataset.py b/grain/_src/python/dataset/dataset.py index 1250a23cd..aebcecaa1 100644 --- a/grain/_src/python/dataset/dataset.py +++ b/grain/_src/python/dataset/dataset.py @@ -223,9 +223,10 @@ def range( Input arguments are interpreted the same way as in Python built-in ``range``: - - ``range(n)`` => start=0, stop=n, step=1 - - ``range(m, n)`` => start=m, stop=n, step=1 - - ``range(m, n, p)`` => start=m, stop=n, step=p + + - ``range(n)`` => start=0, stop=n, step=1 + - ``range(m, n)`` => start=m, stop=n, step=1 + - ``range(m, n, p)`` => start=m, stop=n, step=p The produced values are consistent with the built-in `range` function:: @@ -572,8 +573,9 @@ def seed(self, seed: int) -> MapDataset[T]: When default seed generation is enabled by calling ``ds.seed``, every downstream random transformation will be automatically seeded with a unique seed by default. This simplifies seed management, making it easier to avoid: - - Having to provide a seed in multiple transformations. - - Accidentally reusing the same seed across transformations. + + - Having to provide a seed in multiple transformations. + - Accidentally reusing the same seed across transformations. It is recommended to call this right after the source. ``ds.seed`` has to be called before any random transformations (such as ``shuffle`` or @@ -1079,8 +1081,9 @@ def seed(self, seed: int) -> IterDataset[T]: When default seed generation is enabled by calling ``ds.seed``, every downstream random transformation will be automatically seeded with a unique seed by default. This simplifies seed management, making it easier to avoid: - - Having to provide a seed in multiple transformations. - - Accidentally reusing the same seed across transformations. + + - Having to provide a seed in multiple transformations. + - Accidentally reusing the same seed across transformations. It is recommended to call this right after the source. ``ds.seed`` has to be called before any random transformations (such as ``random_map`` that rely diff --git a/grain/_src/python/dataset/transformations/packing_concat_then_split.py b/grain/_src/python/dataset/transformations/packing_concat_then_split.py index ac6f8fef1..f54e2a6c2 100644 --- a/grain/_src/python/dataset/transformations/packing_concat_then_split.py +++ b/grain/_src/python/dataset/transformations/packing_concat_then_split.py @@ -601,7 +601,7 @@ class ConcatThenSplitIterDataset(dataset.IterDataset): packed element. Positions indicate the position within the unpacked sequence. Features can be "meta features" in which case they are never split - and we do not create *_positions and *_segment_ids features for them. + and we do not create ``*_positions`` and ``*_segment_ids`` features for them. """ def __init__( @@ -623,8 +623,8 @@ def __init__( meta_features: Set of feature names that are considered meta features. Meta features are never split and will be duplicated when other features of the same element are split. Otherwise, meta features are packed - normally (they have their own sequence length). No *_positions and - *_segment_ids features are created for meta features. + normally (they have their own sequence length). No ``*_positions`` and + ``*_segment_ids`` features are created for meta features. split_full_length_features: Whether full-length features are split, or they are considered packed and passed through in priority. Setting split_full_length_features=False is an optimization when some sequences