From bd67cad64fd10eab5989414d0a759a5897b500da Mon Sep 17 00:00:00 2001 From: Adam Kamor Date: Thu, 26 Mar 2026 16:18:47 -0400 Subject: [PATCH 1/2] done --- docs/source/redact/api.rst | 48 ++- docs/source/redact/generator_metadata.rst | 273 ++++++++++++++++++ docs/source/redact/index.rst | 5 +- docs/source/redact/redact_config.rst | 2 + .../generator_metadata/age_shift_metadata.py | 11 + .../base_date_time_generator_metadata.py | 14 + .../generator_metadata/base_metadata.py | 19 ++ .../date_time_generator_metadata.py | 24 ++ .../email_generator_metadata.py | 13 + .../hipaa_address_generator_metadata.py | 21 ++ .../name_generator_metadata.py | 17 ++ .../numeric_value_generator_metadata.py | 12 + .../person_age_generator_metadata.py | 15 + .../phone_number_generator_metadata.py | 16 + .../timestamp_shift_metadata.py | 18 ++ 15 files changed, 505 insertions(+), 3 deletions(-) create mode 100644 docs/source/redact/generator_metadata.rst diff --git a/docs/source/redact/api.rst b/docs/source/redact/api.rst index 9f4d73f..60078cc 100644 --- a/docs/source/redact/api.rst +++ b/docs/source/redact/api.rst @@ -36,4 +36,50 @@ Helper classes :members: .. autoclass:: tonic_textual.helpers.json_conversation_helper.JsonConversationHelper - :members: + :members: + +Generator metadata +------------------------------------------------ +.. autoclass:: tonic_textual.classes.generator_metadata.base_metadata.BaseMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.base_date_time_generator_metadata.BaseDateTimeGeneratorMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.name_generator_metadata.NameGeneratorMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.email_generator_metadata.EmailGeneratorMetadata + :members: + :no-undoc-members: + +.. 
autoclass:: tonic_textual.classes.generator_metadata.phone_number_generator_metadata.PhoneNumberGeneratorMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.date_time_generator_metadata.DateTimeGeneratorMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.timestamp_shift_metadata.TimestampShiftMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.person_age_generator_metadata.PersonAgeGeneratorMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.age_shift_metadata.AgeShiftMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.hipaa_address_generator_metadata.HipaaAddressGeneratorMetadata + :members: + :no-undoc-members: + +.. autoclass:: tonic_textual.classes.generator_metadata.numeric_value_generator_metadata.NumericValueGeneratorMetadata + :members: + :no-undoc-members: diff --git a/docs/source/redact/generator_metadata.rst b/docs/source/redact/generator_metadata.rst new file mode 100644 index 0000000..c9bf555 --- /dev/null +++ b/docs/source/redact/generator_metadata.rst @@ -0,0 +1,273 @@ +.. _generator-metadata: + +Customizing synthesis with generator metadata +============================================== + +When you set an entity type to ``Synthesis`` via ``generator_config``, Textual uses default synthesis settings. The ``generator_metadata`` parameter lets you fine-tune how each entity type's synthesizer behaves. + +``generator_metadata`` is a dictionary that maps entity type names (such as ``"NAME_GIVEN"`` or ``"EMAIL_ADDRESS"``) to metadata instances that control synthesis behavior for that type. + +.. 
code-block:: python + + from tonic_textual.redact_api import TextualNer + from tonic_textual.classes.generator_metadata.name_generator_metadata import NameGeneratorMetadata + from tonic_textual.classes.generator_metadata.email_generator_metadata import EmailGeneratorMetadata + + textual = TextualNer() + + generator_metadata = { + "NAME_GIVEN": NameGeneratorMetadata(preserve_gender=True), + "NAME_FAMILY": NameGeneratorMetadata(is_consistency_case_sensitive=True), + "EMAIL_ADDRESS": EmailGeneratorMetadata(preserve_domain=True), + } + + result = textual.redact( + "Contact John Smith at john.smith@example.com", + generator_default="Synthesis", + generator_metadata=generator_metadata, + ) + +.. note:: + + The ``redact_structured`` method takes a single ``Optional[BaseMetadata]`` rather than a dictionary, since it operates on a single entity type at a time. + +Common parameters +----------------- + +All metadata classes inherit from ``BaseMetadata`` and share the following parameter: + +* ``swaps`` (dict of str to str, default ``{}``) -- A dictionary of explicit replacement mappings. When a detected value matches a key, the corresponding value is used as the synthesized replacement instead of a generated one. + +.. code-block:: python + + from tonic_textual.classes.generator_metadata.name_generator_metadata import NameGeneratorMetadata + + # Always replace "Acme" with "Globex" instead of generating a random name + metadata = NameGeneratorMetadata(swaps={"Acme": "Globex"}) + + +Name synthesis +-------------- + +:class:`~tonic_textual.classes.generator_metadata.name_generator_metadata.NameGeneratorMetadata` controls how synthesized names are generated. Use it with the ``NAME_GIVEN`` and ``NAME_FAMILY`` entity types. + +* ``is_consistency_case_sensitive`` (bool, default ``False``) -- When ``True``, name consistency is case-sensitive. ``"john"`` and ``"John"`` are treated as different names and may receive different replacements. 
+* ``preserve_gender`` (bool, default ``False``) -- When ``True``, the synthesized name preserves the gender of the original. Male names are replaced with male names, and female names with female names. + +.. code-block:: python + + from tonic_textual.classes.generator_metadata.name_generator_metadata import NameGeneratorMetadata + + generator_metadata = { + "NAME_GIVEN": NameGeneratorMetadata(preserve_gender=True), + } + + result = textual.redact( + "John told Mary about the project.", + generator_default="Synthesis", + generator_metadata=generator_metadata, + ) + + +Email synthesis +--------------- + +:class:`~tonic_textual.classes.generator_metadata.email_generator_metadata.EmailGeneratorMetadata` controls how synthesized email addresses are generated. Use it with the ``EMAIL_ADDRESS`` entity type. + +* ``preserve_domain`` (bool, default ``False``) -- When ``True``, the domain portion of the email address is preserved. For example, ``"john@example.com"`` might become ``"alan@example.com"``. + +.. code-block:: python + + from tonic_textual.classes.generator_metadata.email_generator_metadata import EmailGeneratorMetadata + + generator_metadata = { + "EMAIL_ADDRESS": EmailGeneratorMetadata(preserve_domain=True), + } + + result = textual.redact( + "Reach me at john@example.com", + generator_default="Synthesis", + generator_metadata=generator_metadata, + ) + + +Phone number synthesis +---------------------- + +:class:`~tonic_textual.classes.generator_metadata.phone_number_generator_metadata.PhoneNumberGeneratorMetadata` controls how synthesized phone numbers are generated. Use it with the ``PHONE_NUMBER`` entity type. + +* ``use_us_phone_number_generator`` (bool, default ``False``) -- When ``True``, generated phone numbers use a US phone number format. +* ``replace_invalid_numbers`` (bool, default ``True``) -- When ``True``, detected phone numbers that are not valid are still replaced with synthesized values. + +.. 
code-block:: python + + from tonic_textual.classes.generator_metadata.phone_number_generator_metadata import PhoneNumberGeneratorMetadata + + generator_metadata = { + "PHONE_NUMBER": PhoneNumberGeneratorMetadata( + use_us_phone_number_generator=True, + replace_invalid_numbers=True, + ), + } + + result = textual.redact( + "Call me at 555-0123.", + generator_default="Synthesis", + generator_metadata=generator_metadata, + ) + + +Date and time synthesis +----------------------- + +:class:`~tonic_textual.classes.generator_metadata.date_time_generator_metadata.DateTimeGeneratorMetadata` controls how synthesized dates and times are generated. Use it with the ``DATE_TIME`` entity type. Dates are shifted by a random number of days within a configurable range. + +* ``scramble_unrecognized_dates`` (bool, default ``True``) -- When ``True``, dates that Textual cannot parse into a standard format are still scrambled. +* ``additional_date_formats`` (list of str, default ``[]``) -- Additional date format patterns that Textual should recognize. Uses Python ``strftime``/``strptime`` format codes. +* ``apply_constant_shift_to_document`` (bool, default ``False``) -- When ``True``, all dates within the same document are shifted by the same random offset, preserving relative time differences between dates. +* ``metadata`` (:class:`~tonic_textual.classes.generator_metadata.timestamp_shift_metadata.TimestampShiftMetadata`) -- Controls the date shift range. Default shifts dates by -7 to +7 days. + +TimestampShiftMetadata +^^^^^^^^^^^^^^^^^^^^^^ + +:class:`~tonic_textual.classes.generator_metadata.timestamp_shift_metadata.TimestampShiftMetadata` configures the range of days by which dates can be shifted. + +* ``left_shift_in_days`` (int, default ``-7``) -- The minimum shift in days. Use a negative value to shift dates into the past. +* ``right_shift_in_days`` (int, default ``7``) -- The maximum shift in days. Use a positive value to shift dates into the future. + +.. 
code-block:: python + + from tonic_textual.classes.generator_metadata.date_time_generator_metadata import DateTimeGeneratorMetadata + from tonic_textual.classes.generator_metadata.timestamp_shift_metadata import TimestampShiftMetadata + + generator_metadata = { + "DATE_TIME": DateTimeGeneratorMetadata( + apply_constant_shift_to_document=True, + metadata=TimestampShiftMetadata( + left_shift_in_days=-30, + right_shift_in_days=30, + ), + ), + } + + result = textual.redact( + "The meeting is on 2024-01-15 and the deadline is 2024-02-01.", + generator_default="Synthesis", + generator_metadata=generator_metadata, + ) + + +Person age synthesis +-------------------- + +:class:`~tonic_textual.classes.generator_metadata.person_age_generator_metadata.PersonAgeGeneratorMetadata` controls how synthesized ages are generated. Use it with the ``PERSON_AGE`` entity type. + +* ``scramble_unrecognized_dates`` (bool, default ``True``) -- When ``True``, dates that Textual cannot parse are still scrambled. +* ``metadata`` (:class:`~tonic_textual.classes.generator_metadata.age_shift_metadata.AgeShiftMetadata`) -- Controls the age shift amount. Default shifts ages by 7 years. + +AgeShiftMetadata +^^^^^^^^^^^^^^^^ + +:class:`~tonic_textual.classes.generator_metadata.age_shift_metadata.AgeShiftMetadata` configures how many years to shift detected ages. + +* ``age_shift_in_years`` (int, default ``7``) -- The number of years to shift the age. + +.. 
code-block:: python + + from tonic_textual.classes.generator_metadata.person_age_generator_metadata import PersonAgeGeneratorMetadata + from tonic_textual.classes.generator_metadata.age_shift_metadata import AgeShiftMetadata + + generator_metadata = { + "PERSON_AGE": PersonAgeGeneratorMetadata( + metadata=AgeShiftMetadata(age_shift_in_years=3), + ), + } + + result = textual.redact( + "The patient is 45 years old.", + generator_default="Synthesis", + generator_metadata=generator_metadata, + ) + + +Address synthesis (HIPAA) +------------------------- + +:class:`~tonic_textual.classes.generator_metadata.hipaa_address_generator_metadata.HipaaAddressGeneratorMetadata` controls how synthesized addresses are generated for location entity types such as ``LOCATION_ADDRESS`` and ``LOCATION_ZIP``. By default, address synthesis follows HIPAA Safe Harbor de-identification rules. + +* ``use_non_hipaa_address_generator`` (bool, default ``False``) -- When ``True``, uses a non-HIPAA-compliant address generator that may produce more realistic addresses but does not guarantee HIPAA Safe Harbor compliance. +* ``replace_truncated_zeros_in_zip_code`` (bool, default ``True``) -- When ``True``, ZIP codes that have been truncated to three digits (per HIPAA Safe Harbor) have the removed digits replaced with zeros. +* ``realistic_synthetic_values`` (bool, default ``True``) -- When ``True``, generates realistic-looking synthetic address values. + +.. 
code-block:: python + + from tonic_textual.classes.generator_metadata.hipaa_address_generator_metadata import HipaaAddressGeneratorMetadata + + generator_metadata = { + "LOCATION_ADDRESS": HipaaAddressGeneratorMetadata( + realistic_synthetic_values=True, + replace_truncated_zeros_in_zip_code=True, + ), + } + + result = textual.redact( + "She lives at 123 Main St, Springfield, IL 62704.", + generator_default="Synthesis", + generator_metadata=generator_metadata, + ) + + +Numeric value synthesis +----------------------- + +:class:`~tonic_textual.classes.generator_metadata.numeric_value_generator_metadata.NumericValueGeneratorMetadata` controls how synthesized numeric values are generated. Use it with the ``NUMERIC_VALUE`` entity type. + +* ``use_oracle_integer_pk_generator`` (bool, default ``False``) -- When ``True``, uses a generator designed for Oracle integer primary keys. + +.. code-block:: python + + from tonic_textual.classes.generator_metadata.numeric_value_generator_metadata import NumericValueGeneratorMetadata + + generator_metadata = { + "NUMERIC_VALUE": NumericValueGeneratorMetadata( + use_oracle_integer_pk_generator=True, + ), + } + + +Combining multiple metadata configurations +------------------------------------------- + +You can combine multiple metadata configurations in a single call. This example configures synthesis for names, emails, and dates at once: + +.. 
code-block:: python + + from tonic_textual.redact_api import TextualNer + from tonic_textual.classes.generator_metadata.name_generator_metadata import NameGeneratorMetadata + from tonic_textual.classes.generator_metadata.email_generator_metadata import EmailGeneratorMetadata + from tonic_textual.classes.generator_metadata.date_time_generator_metadata import DateTimeGeneratorMetadata + from tonic_textual.classes.generator_metadata.timestamp_shift_metadata import TimestampShiftMetadata + + textual = TextualNer() + + result = textual.redact( + "John Smith (john@acme.com) joined on 2024-01-15.", + generator_default="Off", + generator_config={ + "NAME_GIVEN": "Synthesis", + "NAME_FAMILY": "Synthesis", + "EMAIL_ADDRESS": "Synthesis", + "DATE_TIME": "Synthesis", + }, + generator_metadata={ + "NAME_GIVEN": NameGeneratorMetadata(preserve_gender=True), + "EMAIL_ADDRESS": EmailGeneratorMetadata(preserve_domain=True), + "DATE_TIME": DateTimeGeneratorMetadata( + apply_constant_shift_to_document=True, + metadata=TimestampShiftMetadata( + left_shift_in_days=-14, + right_shift_in_days=14, + ), + ), + }, + ) diff --git a/docs/source/redact/index.rst b/docs/source/redact/index.rst index 27770ab..2f8b983 100644 --- a/docs/source/redact/index.rst +++ b/docs/source/redact/index.rst @@ -13,13 +13,14 @@ When Textual operates on your data: 2. Second, it uses information about where entities are located to tokenize or synthesize the data. -In :doc:`Choosing tokenization or synthesis <./redact_config>` you can learn different ways to configure your output. +In :doc:`Choosing tokenization or synthesis <./redact_config>` you can learn different ways to configure your output. To fine-tune how synthesized values are generated for specific entity types, see :doc:`Customizing synthesis with generator metadata <./generator_metadata>`. .. 
toctree:: :caption: In this section: - + redact_config + generator_metadata redacting_text redacting_json redacting_html diff --git a/docs/source/redact/redact_config.rst b/docs/source/redact/redact_config.rst index 658b601..9398f56 100644 --- a/docs/source/redact/redact_config.rst +++ b/docs/source/redact/redact_config.rst @@ -35,6 +35,8 @@ Synthesized entities are replaced with realistic fake values, For example:: These fake values are consistent. So in the above example, John goes to Alan and will do so in all cases within the document and optionally across documents as well. +To further customize how synthesized values are generated for specific entity types, see :ref:`generator-metadata`. + Group synthesis ^^^^^^^^^^^^^^^^^^ diff --git a/tonic_textual/classes/generator_metadata/age_shift_metadata.py b/tonic_textual/classes/generator_metadata/age_shift_metadata.py index 9898bb1..3bdcb1d 100644 --- a/tonic_textual/classes/generator_metadata/age_shift_metadata.py +++ b/tonic_textual/classes/generator_metadata/age_shift_metadata.py @@ -2,6 +2,17 @@ class AgeShiftMetadata(dict): + """Configuration for the age shift amount used by + :class:`PersonAgeGeneratorMetadata`. + + Defines how many years to shift detected ages by. + + Parameters + ---------- + age_shift_in_years : int + The number of years to shift the age. Default is ``7``. + """ + def __init__( self, age_shift_in_years: int = 7 diff --git a/tonic_textual/classes/generator_metadata/base_date_time_generator_metadata.py b/tonic_textual/classes/generator_metadata/base_date_time_generator_metadata.py index 3b3e5a1..b9a4ad4 100644 --- a/tonic_textual/classes/generator_metadata/base_date_time_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/base_date_time_generator_metadata.py @@ -6,6 +6,20 @@ class BaseDateTimeGeneratorMetadata(BaseMetadata): + """Base class for date and time related generator metadata. + + Extends :class:`BaseMetadata` with a common date/time parameter. 
You + typically do not instantiate this class directly. Instead, use + :class:`DateTimeGeneratorMetadata` or :class:`PersonAgeGeneratorMetadata`. + + Parameters + ---------- + scramble_unrecognized_dates : bool + When ``True``, dates that Textual cannot parse into a standard format + are still scrambled. When ``False``, unrecognized dates are left + unchanged. Default is ``True``. + """ + def __init__( self, custom_generator: Optional[GeneratorType] = None, diff --git a/tonic_textual/classes/generator_metadata/base_metadata.py b/tonic_textual/classes/generator_metadata/base_metadata.py index f133201..c33d73b 100644 --- a/tonic_textual/classes/generator_metadata/base_metadata.py +++ b/tonic_textual/classes/generator_metadata/base_metadata.py @@ -5,6 +5,25 @@ class BaseMetadata(dict): + """Base class for all generator metadata configurations. + + Provides common parameters shared by all metadata types. You typically + do not instantiate this class directly. Instead, use a specific metadata + subclass such as :class:`NameGeneratorMetadata` or + :class:`EmailGeneratorMetadata`. + + Parameters + ---------- + custom_generator : GeneratorType, optional + The generator type. Set automatically by subclasses. + generator_version : GeneratorVersion + The generator version to use. Default is ``V1``. + swaps : dict of str to str, optional + A dictionary of explicit replacement mappings. When a detected value + matches a key in the dictionary, the corresponding value is used as + the synthesized replacement instead of a generated one. 
+ """ + def __init__( self, custom_generator: Optional[GeneratorType] = None, diff --git a/tonic_textual/classes/generator_metadata/date_time_generator_metadata.py b/tonic_textual/classes/generator_metadata/date_time_generator_metadata.py index 59f169a..df499aa 100644 --- a/tonic_textual/classes/generator_metadata/date_time_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/date_time_generator_metadata.py @@ -7,6 +7,30 @@ class DateTimeGeneratorMetadata(BaseDateTimeGeneratorMetadata): + """Metadata configuration for date and time synthesis. + + Controls how synthesized date and time values are generated for the + ``DATE_TIME`` entity type. Dates are shifted by a random number of days + within a configurable range. + + Parameters + ---------- + scramble_unrecognized_dates : bool + When ``True``, dates that Textual cannot parse into a standard + format are still scrambled. Default is ``True``. + additional_date_formats : list of str + A list of additional date format patterns that Textual should + recognize. Use Python ``strftime``/``strptime`` format codes. + Default is an empty list. + apply_constant_shift_to_document : bool + When ``True``, all dates within the same document are shifted by + the same random offset, preserving relative time differences + between dates. Default is ``False``. + metadata : TimestampShiftMetadata + Configuration for the date shift range. Default shifts dates by + -7 to +7 days. + """ + def __init__( self, generator_version: GeneratorVersion = GeneratorVersion.V1, diff --git a/tonic_textual/classes/generator_metadata/email_generator_metadata.py b/tonic_textual/classes/generator_metadata/email_generator_metadata.py index 78ce063..53b917e 100644 --- a/tonic_textual/classes/generator_metadata/email_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/email_generator_metadata.py @@ -6,6 +6,19 @@ class EmailGeneratorMetadata(BaseMetadata): + """Metadata configuration for email address synthesis. 
+ + Controls how synthesized email addresses are generated for the + ``EMAIL_ADDRESS`` entity type. + + Parameters + ---------- + preserve_domain : bool + When ``True``, the domain portion of the email address is kept + intact. For example, ``"john@example.com"`` might become + ``"alan@example.com"``. Default is ``False``. + """ + def __init__( self, generator_version: GeneratorVersion = GeneratorVersion.V1, diff --git a/tonic_textual/classes/generator_metadata/hipaa_address_generator_metadata.py b/tonic_textual/classes/generator_metadata/hipaa_address_generator_metadata.py index 77b5ec5..2388a1f 100644 --- a/tonic_textual/classes/generator_metadata/hipaa_address_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/hipaa_address_generator_metadata.py @@ -6,6 +6,27 @@ class HipaaAddressGeneratorMetadata(BaseMetadata): + """Metadata configuration for HIPAA-compliant address synthesis. + + Controls how synthesized addresses are generated for location entity + types such as ``LOCATION_ADDRESS`` and ``LOCATION_ZIP``. By default, + address synthesis follows HIPAA Safe Harbor de-identification rules. + + Parameters + ---------- + use_non_hipaa_address_generator : bool + When ``True``, uses a non-HIPAA-compliant address generator that + may produce more realistic addresses but does not guarantee HIPAA + Safe Harbor compliance. Default is ``False``. + replace_truncated_zeros_in_zip_code : bool + When ``True``, ZIP codes that have been truncated to three digits + (per HIPAA Safe Harbor) have the removed digits replaced with + zeros. Default is ``True``. + realistic_synthetic_values : bool + When ``True``, generates realistic-looking synthetic address values. + Default is ``True``. 
+ """ + def __init__( self, generator_version: GeneratorVersion = GeneratorVersion.V1, diff --git a/tonic_textual/classes/generator_metadata/name_generator_metadata.py b/tonic_textual/classes/generator_metadata/name_generator_metadata.py index a230f99..ade9179 100644 --- a/tonic_textual/classes/generator_metadata/name_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/name_generator_metadata.py @@ -6,6 +6,23 @@ class NameGeneratorMetadata(BaseMetadata): + """Metadata configuration for name synthesis. + + Controls how synthesized names are generated for entity types such as + ``NAME_GIVEN`` and ``NAME_FAMILY``. + + Parameters + ---------- + is_consistency_case_sensitive : bool + When ``True``, name consistency is case-sensitive. For example, + ``"john"`` and ``"John"`` are treated as different names and may + receive different replacements. Default is ``False``. + preserve_gender : bool + When ``True``, the synthesized name preserves the gender of the + original name. Male names are replaced with male names, and female + names are replaced with female names. Default is ``False``. + """ + def __init__( self, generator_version: GeneratorVersion = GeneratorVersion.V1, diff --git a/tonic_textual/classes/generator_metadata/numeric_value_generator_metadata.py b/tonic_textual/classes/generator_metadata/numeric_value_generator_metadata.py index a7fe189..b3221ad 100644 --- a/tonic_textual/classes/generator_metadata/numeric_value_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/numeric_value_generator_metadata.py @@ -6,6 +6,18 @@ class NumericValueGeneratorMetadata(BaseMetadata): + """Metadata configuration for numeric value synthesis. + + Controls how synthesized numeric values are generated for the + ``NUMERIC_VALUE`` entity type. + + Parameters + ---------- + use_oracle_integer_pk_generator : bool + When ``True``, uses a generator designed for Oracle integer primary + keys. Default is ``False``. 
+ """ + def __init__( self, generator_version: GeneratorVersion = GeneratorVersion.V1, diff --git a/tonic_textual/classes/generator_metadata/person_age_generator_metadata.py b/tonic_textual/classes/generator_metadata/person_age_generator_metadata.py index 37cd026..867e799 100644 --- a/tonic_textual/classes/generator_metadata/person_age_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/person_age_generator_metadata.py @@ -7,6 +7,21 @@ class PersonAgeGeneratorMetadata(BaseDateTimeGeneratorMetadata): + """Metadata configuration for person age synthesis. + + Controls how synthesized ages are generated for the ``PERSON_AGE`` + entity type. Ages are shifted by a configurable number of years. + + Parameters + ---------- + scramble_unrecognized_dates : bool + When ``True``, dates that Textual cannot parse into a standard + format are still scrambled. Default is ``True``. + metadata : AgeShiftMetadata + Configuration for the age shift amount. Default shifts ages by + 7 years. + """ + def __init__( self, generator_version: GeneratorVersion = GeneratorVersion.V1, diff --git a/tonic_textual/classes/generator_metadata/phone_number_generator_metadata.py b/tonic_textual/classes/generator_metadata/phone_number_generator_metadata.py index c25fe49..d0e35fe 100644 --- a/tonic_textual/classes/generator_metadata/phone_number_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/phone_number_generator_metadata.py @@ -6,6 +6,22 @@ class PhoneNumberGeneratorMetadata(BaseMetadata): + """Metadata configuration for phone number synthesis. + + Controls how synthesized phone numbers are generated for the + ``PHONE_NUMBER`` entity type. + + Parameters + ---------- + use_us_phone_number_generator : bool + When ``True``, generated phone numbers use a US phone number format. + Default is ``False``. + replace_invalid_numbers : bool + When ``True``, phone numbers that are detected but are not valid + phone numbers are still replaced with synthesized values. 
Default + is ``True``. + """ + def __init__( self, generator_version: GeneratorVersion = GeneratorVersion.V1, diff --git a/tonic_textual/classes/generator_metadata/timestamp_shift_metadata.py b/tonic_textual/classes/generator_metadata/timestamp_shift_metadata.py index e34461a..c241a9f 100644 --- a/tonic_textual/classes/generator_metadata/timestamp_shift_metadata.py +++ b/tonic_textual/classes/generator_metadata/timestamp_shift_metadata.py @@ -4,6 +4,24 @@ class TimestampShiftMetadata(BaseMetadata): + """Configuration for the date shift range used by + :class:`DateTimeGeneratorMetadata`. + + Defines the range of days by which dates can be shifted. The actual + shift for each date is randomly chosen within the specified range. + + Parameters + ---------- + left_shift_in_days : int, optional + The minimum (leftmost) shift in days. Use a negative value to shift + dates into the past. Default is ``-7``. + right_shift_in_days : int, optional + The maximum (rightmost) shift in days. Use a positive value to shift + dates into the future. Default is ``7``. + time_stamp_shift_in_days : int, optional + Deprecated. Use ``left_shift_in_days`` and ``right_shift_in_days`` + instead. 
+ """ def __init__( self, From e4d2762e8111c64f9a6480e5e154153b3932b830 Mon Sep 17 00:00:00 2001 From: Adam Kamor <9391841+akamor@users.noreply.github.com> Date: Thu, 26 Mar 2026 18:39:13 -0400 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Janice Manwiller <107077736+JaniceManwiller@users.noreply.github.com> --- docs/source/redact/generator_metadata.rst | 30 +++++++++---------- docs/source/redact/redact_config.rst | 2 +- .../base_date_time_generator_metadata.py | 2 +- .../date_time_generator_metadata.py | 6 ++-- .../hipaa_address_generator_metadata.py | 6 ++-- .../name_generator_metadata.py | 2 +- .../person_age_generator_metadata.py | 4 +-- .../phone_number_generator_metadata.py | 6 ++-- 8 files changed, 29 insertions(+), 29 deletions(-) diff --git a/docs/source/redact/generator_metadata.rst b/docs/source/redact/generator_metadata.rst index c9bf555..3dfc3bf 100644 --- a/docs/source/redact/generator_metadata.rst +++ b/docs/source/redact/generator_metadata.rst @@ -3,7 +3,7 @@ Customizing synthesis with generator metadata ============================================== -When you set an entity type to ``Synthesis`` via ``generator_config``, Textual uses default synthesis settings. The ``generator_metadata`` parameter lets you fine-tune how each entity type's synthesizer behaves. +When you use ``generator_config`` to set an entity type to ``Synthesis``, Textual uses default synthesis settings. The ``generator_metadata`` parameter allows you to fine-tune how each entity type's synthesizer behaves. ``generator_metadata`` is a dictionary that maps entity type names (such as ``"NAME_GIVEN"`` or ``"EMAIL_ADDRESS"``) to metadata instances that control synthesis behavior for that type. @@ -29,7 +29,7 @@ When you set an entity type to ``Synthesis`` via ``generator_config``, Textual u .. note:: - The ``redact_structured`` method takes a single ``Optional[BaseMetadata]`` rather than a dictionary, since it operates on a single entity type at a time. 
+ The ``redact_structured`` method takes a single ``Optional[BaseMetadata]`` instead of a dictionary, because it operates on a single entity type at a time. Common parameters ----------------- @@ -51,7 +51,7 @@ Name synthesis :class:`~tonic_textual.classes.generator_metadata.name_generator_metadata.NameGeneratorMetadata` controls how synthesized names are generated. Use it with the ``NAME_GIVEN`` and ``NAME_FAMILY`` entity types. -* ``is_consistency_case_sensitive`` (bool, default ``False``) -- When ``True``, name consistency is case-sensitive. ``"john"`` and ``"John"`` are treated as different names and may receive different replacements. +* ``is_consistency_case_sensitive`` (bool, default ``False``) -- When ``True``, name consistency is case-sensitive. ``"john"`` and ``"John"`` are treated as different names and might receive different replacements. * ``preserve_gender`` (bool, default ``False``) -- When ``True``, the synthesized name preserves the gender of the original. Male names are replaced with male names, and female names with female names. .. code-block:: python @@ -94,10 +94,10 @@ Email synthesis Phone number synthesis ---------------------- -:class:`~tonic_textual.classes.generator_metadata.phone_number_generator_metadata.PhoneNumberGeneratorMetadata` controls how synthesized phone numbers are generated. Use it with the ``PHONE_NUMBER`` entity type. +:class:`~tonic_textual.classes.generator_metadata.phone_number_generator_metadata.PhoneNumberGeneratorMetadata` controls how synthesized telephone numbers are generated. Use it with the ``PHONE_NUMBER`` entity type. -* ``use_us_phone_number_generator`` (bool, default ``False``) -- When ``True``, generated phone numbers use a US phone number format. -* ``replace_invalid_numbers`` (bool, default ``True``) -- When ``True``, detected phone numbers that are not valid are still replaced with synthesized values. 
+* ``use_us_phone_number_generator`` (bool, default ``False``) -- When ``True``, generated telephone numbers use a US telephone number format. +* ``replace_invalid_numbers`` (bool, default ``True``) -- When ``True``, detected telephone numbers that are not valid are still replaced with synthesized values. .. code-block:: python @@ -122,10 +122,10 @@ Date and time synthesis :class:`~tonic_textual.classes.generator_metadata.date_time_generator_metadata.DateTimeGeneratorMetadata` controls how synthesized dates and times are generated. Use it with the ``DATE_TIME`` entity type. Dates are shifted by a random number of days within a configurable range. -* ``scramble_unrecognized_dates`` (bool, default ``True``) -- When ``True``, dates that Textual cannot parse into a standard format are still scrambled. +* ``scramble_unrecognized_dates`` (bool, default ``True``) -- When ``True``, dates that Textual cannot parse into a standard format are scrambled. * ``additional_date_formats`` (list of str, default ``[]``) -- Additional date format patterns that Textual should recognize. Uses Python ``strftime``/``strptime`` format codes. -* ``apply_constant_shift_to_document`` (bool, default ``False``) -- When ``True``, all dates within the same document are shifted by the same random offset, preserving relative time differences between dates. -* ``metadata`` (:class:`~tonic_textual.classes.generator_metadata.timestamp_shift_metadata.TimestampShiftMetadata`) -- Controls the date shift range. Default shifts dates by -7 to +7 days. +* ``apply_constant_shift_to_document`` (bool, default ``False``) -- When ``True``, all dates within the same document are shifted by the same random offset. This preserves the relative time differences between dates. +* ``metadata`` (:class:`~tonic_textual.classes.generator_metadata.timestamp_shift_metadata.TimestampShiftMetadata`) -- Controls the date shift range. By default, dates shift by -7 to +7 days.
TimestampShiftMetadata ^^^^^^^^^^^^^^^^^^^^^^ @@ -162,13 +162,13 @@ Person age synthesis :class:`~tonic_textual.classes.generator_metadata.person_age_generator_metadata.PersonAgeGeneratorMetadata` controls how synthesized ages are generated. Use it with the ``PERSON_AGE`` entity type. -* ``scramble_unrecognized_dates`` (bool, default ``True``) -- When ``True``, dates that Textual cannot parse are still scrambled. -* ``metadata`` (:class:`~tonic_textual.classes.generator_metadata.age_shift_metadata.AgeShiftMetadata`) -- Controls the age shift amount. Default shifts ages by 7 years. +* ``scramble_unrecognized_dates`` (bool, default ``True``) -- When ``True``, dates that Textual cannot parse are scrambled. +* ``metadata`` (:class:`~tonic_textual.classes.generator_metadata.age_shift_metadata.AgeShiftMetadata`) -- Controls the age shift amount. By default, ages shift by 7 years. AgeShiftMetadata ^^^^^^^^^^^^^^^^ -:class:`~tonic_textual.classes.generator_metadata.age_shift_metadata.AgeShiftMetadata` configures how many years to shift detected ages. +:class:`~tonic_textual.classes.generator_metadata.age_shift_metadata.AgeShiftMetadata` configures the number of years to shift detected ages. * ``age_shift_in_years`` (int, default ``7``) -- The number of years to shift the age. @@ -195,8 +195,8 @@ Address synthesis (HIPAA) :class:`~tonic_textual.classes.generator_metadata.hipaa_address_generator_metadata.HipaaAddressGeneratorMetadata` controls how synthesized addresses are generated for location entity types such as ``LOCATION_ADDRESS`` and ``LOCATION_ZIP``. By default, address synthesis follows HIPAA Safe Harbor de-identification rules. -* ``use_non_hipaa_address_generator`` (bool, default ``False``) -- When ``True``, uses a non-HIPAA-compliant address generator that may produce more realistic addresses but does not guarantee HIPAA Safe Harbor compliance. 
-* ``replace_truncated_zeros_in_zip_code`` (bool, default ``True``) -- When ``True``, ZIP codes that have been truncated to three digits (per HIPAA Safe Harbor) have the removed digits replaced with zeros. +* ``use_non_hipaa_address_generator`` (bool, default ``False``) -- When ``True``, uses a non-HIPAA-compliant address generator that might produce more realistic addresses, but does not guarantee HIPAA Safe Harbor compliance. +* ``replace_truncated_zeros_in_zip_code`` (bool, default ``True``) -- When ``True``, for ZIP codes that are truncated to three digits (per HIPAA Safe Harbor), the removed digits are replaced with zeros. * ``realistic_synthetic_values`` (bool, default ``True``) -- When ``True``, generates realistic-looking synthetic address values. .. code-block:: python @@ -238,7 +238,7 @@ Numeric value synthesis Combining multiple metadata configurations ------------------------------------------- -You can combine multiple metadata configurations in a single call. This example configures synthesis for names, emails, and dates at once: +You can combine multiple metadata configurations in a single call. This example configures synthesis for names, emails, and dates: .. code-block:: python diff --git a/docs/source/redact/redact_config.rst b/docs/source/redact/redact_config.rst index 9398f56..4dc7892 100644 --- a/docs/source/redact/redact_config.rst +++ b/docs/source/redact/redact_config.rst @@ -33,7 +33,7 @@ Synthesized entities are replaced with realistic fake values, For example:: My name is John Smith. -> My name is Alan Johnson -These fake values are consistent. So in the above example, John goes to Alan and will do so in all cases within the document and optionally across documents as well. +These fake values are consistent. So in the above example, John changes to Alan and does so in all cases within the document and optionally across documents as well. 
To further customize how synthesized values are generated for specific entity types, see :ref:`generator-metadata`. diff --git a/tonic_textual/classes/generator_metadata/base_date_time_generator_metadata.py b/tonic_textual/classes/generator_metadata/base_date_time_generator_metadata.py index b9a4ad4..bd65fdf 100644 --- a/tonic_textual/classes/generator_metadata/base_date_time_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/base_date_time_generator_metadata.py @@ -16,7 +16,7 @@ class BaseDateTimeGeneratorMetadata(BaseMetadata): ---------- scramble_unrecognized_dates : bool When ``True``, dates that Textual cannot parse into a standard format - are still scrambled. When ``False``, unrecognized dates are left + are scrambled. When ``False``, unrecognized dates are left unchanged. Default is ``True``. """ diff --git a/tonic_textual/classes/generator_metadata/date_time_generator_metadata.py b/tonic_textual/classes/generator_metadata/date_time_generator_metadata.py index df499aa..f05b8cb 100644 --- a/tonic_textual/classes/generator_metadata/date_time_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/date_time_generator_metadata.py @@ -17,17 +17,17 @@ class DateTimeGeneratorMetadata(BaseDateTimeGeneratorMetadata): ---------- scramble_unrecognized_dates : bool When ``True``, dates that Textual cannot parse into a standard - format are still scrambled. Default is ``True``. + format are scrambled. Default is ``True``. additional_date_formats : list of str A list of additional date format patterns that Textual should recognize. Use Python ``strftime``/``strptime`` format codes. Default is an empty list. apply_constant_shift_to_document : bool When ``True``, all dates within the same document are shifted by - the same random offset, preserving relative time differences + the same random offset. This preserves relative time differences between dates. Default is ``False``. 
metadata : TimestampShiftMetadata - Configuration for the date shift range. Default shifts dates by + Configuration for the date shift range. By default, dates shift by -7 to +7 days. """ diff --git a/tonic_textual/classes/generator_metadata/hipaa_address_generator_metadata.py b/tonic_textual/classes/generator_metadata/hipaa_address_generator_metadata.py index 2388a1f..b5aa762 100644 --- a/tonic_textual/classes/generator_metadata/hipaa_address_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/hipaa_address_generator_metadata.py @@ -16,11 +16,11 @@ class HipaaAddressGeneratorMetadata(BaseMetadata): ---------- use_non_hipaa_address_generator : bool When ``True``, uses a non-HIPAA-compliant address generator that - may produce more realistic addresses but does not guarantee HIPAA + might produce more realistic addresses, but does not guarantee HIPAA Safe Harbor compliance. Default is ``False``. replace_truncated_zeros_in_zip_code : bool - When ``True``, ZIP codes that have been truncated to three digits - (per HIPAA Safe Harbor) have the removed digits replaced with + When ``True``, for ZIP codes that are truncated to three digits + (per HIPAA Safe Harbor), the removed digits are replaced with zeros. Default is ``True``. realistic_synthetic_values : bool When ``True``, generates realistic-looking synthetic address values. diff --git a/tonic_textual/classes/generator_metadata/name_generator_metadata.py b/tonic_textual/classes/generator_metadata/name_generator_metadata.py index ade9179..5a04422 100644 --- a/tonic_textual/classes/generator_metadata/name_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/name_generator_metadata.py @@ -15,7 +15,7 @@ class NameGeneratorMetadata(BaseMetadata): ---------- is_consistency_case_sensitive : bool When ``True``, name consistency is case-sensitive. 
For example, - ``"john"`` and ``"John"`` are treated as different names and may + ``"john"`` and ``"John"`` are treated as different names and might receive different replacements. Default is ``False``. preserve_gender : bool When ``True``, the synthesized name preserves the gender of the diff --git a/tonic_textual/classes/generator_metadata/person_age_generator_metadata.py b/tonic_textual/classes/generator_metadata/person_age_generator_metadata.py index 867e799..0bcc1f7 100644 --- a/tonic_textual/classes/generator_metadata/person_age_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/person_age_generator_metadata.py @@ -16,9 +16,9 @@ class PersonAgeGeneratorMetadata(BaseDateTimeGeneratorMetadata): ---------- scramble_unrecognized_dates : bool When ``True``, dates that Textual cannot parse into a standard - format are still scrambled. Default is ``True``. + format are scrambled. Default is ``True``. metadata : AgeShiftMetadata - Configuration for the age shift amount. Default shifts ages by + Configuration for the age shift amount. By default, ages shift by 7 years. """ diff --git a/tonic_textual/classes/generator_metadata/phone_number_generator_metadata.py b/tonic_textual/classes/generator_metadata/phone_number_generator_metadata.py index d0e35fe..51ab56a 100644 --- a/tonic_textual/classes/generator_metadata/phone_number_generator_metadata.py +++ b/tonic_textual/classes/generator_metadata/phone_number_generator_metadata.py @@ -8,17 +8,17 @@ class PhoneNumberGeneratorMetadata(BaseMetadata): """Metadata configuration for phone number synthesis. - Controls how synthesized phone numbers are generated for the + Controls how synthesized telephone numbers are generated for the ``PHONE_NUMBER`` entity type. Parameters ---------- use_us_phone_number_generator : bool - When ``True``, generated phone numbers use a US phone number format. + When ``True``, generated telephone numbers use a US phone number format. Default is ``False``. 
replace_invalid_numbers : bool When ``True``, phone numbers that are detected but are not valid - phone numbers are still replaced with synthesized values. Default + phone numbers are replaced with synthesized values. Default is ``True``. """