From c6788354b23ae36786fb52f2fc4606d6c855c653 Mon Sep 17 00:00:00 2001 From: Clara Petrescu-Stompor Date: Thu, 1 Aug 2024 17:03:15 +0200 Subject: [PATCH 1/4] =?UTF-8?q?correction=20des=20erreurs=20des=20classes?= =?UTF-8?q?=20ajout=C3=A9es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- medkit/text/deid.py | 116 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 2 deletions(-) diff --git a/medkit/text/deid.py b/medkit/text/deid.py index 2ef04031..3ebb4d9b 100644 --- a/medkit/text/deid.py +++ b/medkit/text/deid.py @@ -1,6 +1,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Iterator +from typing import TYPE_CHECKING, ClassVar, Iterator, Optional + +from presidio_analyzer import Pattern, PatternRecognizer from medkit._import import import_optional from medkit.core.text import Entity, span_utils @@ -11,7 +13,7 @@ presidio_analyzer = import_optional("presidio_analyzer", extra="deid") -__all__ = ["PIIDetector"] +__all__ = ["PIIDetector", "FrDateRecognizer", "FrAgeRecognizer"] class PIIDetector(NEROperation): @@ -35,3 +37,113 @@ def _run_one(self, segment: Segment) -> Iterator[Entity]: spans=spans, metadata={"score": result.score}, ) + + +class FrDateRecognizer(PatternRecognizer): + """ + Recognizes French Date using regex. 
+ + :param patterns: List of patterns to be used by this recognizer + :param supported_language: Language this recognizer supports + :param supported_entity: The entity this recognizer can detect + """ + + PATTERNS: ClassVar[list[Pattern]] = [ + Pattern( + "French dates with day month year", + r"\b(?i)(\d{1,2}|1er) " + r"((janvier|janv.|JAN)|(février|févr.|FÉV)|(mars|MAR)|" + r"(avril|avr.|AVR)|(mai|MAI)|(juin|JUN)|(juillet|juill.|JUL)|" + r"(août|AOÛ)|(septembre|sept.|SEP)|(octobre|oct.|OCT)|" + r"(novembre|nov.|NOV)|(décembre|déc|DÉC)) \d{4}\b", + 0.9, + ), + Pattern( + "French dates with month year", + r"\b(?i)((janvier|janv.|JAN)|(février|févr.|FÉV)|(mars|MAR)|" + r"(avril|avr.|AVR)|(mai|MAI)|(juin|JUN)|(juillet|juill.|JUL)|" + r"(août|AOÛ)|(septembre|sept.|SEP)|(octobre|oct.|OCT)|" + r"(novembre|nov.|NOV)|(décembre|déc|DÉC)) \d{4}\b", + 0.9, + ), + Pattern( + "French dates with day month", + r"\b(?i)(\d{1,2} |1er )" + r"((janvier|janv.|JAN)|(février|févr.|FÉV)|(mars|MAR)|" + r"(avril|avr.|AVR)|(mai|MAI)|(juin|JUN)|(juillet|juill.|JUL)|" + r"(août|AOÛ)|(septembre|sept.|SEP)|(octobre|oct.|OCT)|" + r"(novembre|nov.|NOV)|(décembre|déc|DÉC))\b", + 0.9, + ), + Pattern( + "French dates with month", + r"\b(?i)((janvier|janv.|JAN)|(février|févr.|FÉV)|(mars|MAR)|" + r"(avril|avr.|AVR)|(mai|MAI)|(juin|JUN)|(juillet|juill.|JUL)|" + r"(août|AOÛ)|(septembre|sept.|SEP)|(octobre|oct.|OCT)|" + r"(novembre|nov.|NOV)|(décembre|déc|DÉC))\b", + 0.9, + ), + ] + + def __init__( + self, + patterns: list[Pattern] | None = None, + supported_language: str = "fr", + supported_entity: str = "FR_DATE", + ): + patterns = patterns if patterns else self.PATTERNS + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + supported_language=supported_language, + ) + + +class FrAgeRecognizer(PatternRecognizer): + """ + Recognizes French Phone Age using regex. 
+ + :param patterns: List of patterns to be used by this recognizer + :param supported_language: Language this recognizer supports + :param supported_entity: The entity this recognizer can detect + """ + + PATTERNS: ClassVar[list[Pattern]] = [ + Pattern( + "French age written with 'number (ans|mois)' " + "ou 'number et number (ans|mois)' " + "ou 'number à number (ans|mois)' " + "ou ' number ou number (ans|mois)'", + r"\d+\s+(ans|mois)|\d+\s+et\s+\d+\s+(ans|mois)|" r"\d+\s+à\s+\d+\s+(ans|mois)|\d+\s+ou\s+\d+\s+(ans|mois)", + 0.9, + ), + Pattern( + "French age written with 'number ans et number mois' " + "ou 'number ans et number ou number mois' " + " ou 'number ans et number à number mois' ", + r"\d+\s+ans\s+et\s+\d+\s+mois|\d+\s+ans\s+et\s+\d+\s+à\s+\d+\s+mois|" + r"\d+\s+ans\s+et\s+\d+\s+ou\s+\d+\s+mois", + 0.9, + ), + Pattern( + "French age written with letters '.. (ans|mois)' " + "ou ' .. et .. ans' ou ' .. à .. ans' ou ' .. ou .. ans' ", + r"((dix|vingt|trente|quarante|cinquante|soixante|soixante-dix|quatre-vingt|quatre-vingt-dix|cent)" + r"(\s+|-|-et-)?)?(un|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze" + r"|quinze|seize)?\s+((et\s+|à\s+|ou\s+)\d+\s+)?(ans|mois)", + 0.9, + ), + ] + + def __init__( + self, + patterns: list[Pattern] | None = None, + supported_language: str = "fr", + supported_entity: str = "FR_AGE", + ): + patterns = patterns if patterns else self.PATTERNS + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + supported_language=supported_language, + ) From 8b5134b9b36fa91ac8deeea5ccc01a4aa33830a8 Mon Sep 17 00:00:00 2001 From: Clara Petrescu-Stompor Date: Thu, 1 Aug 2024 17:14:08 +0200 Subject: [PATCH 2/4] =?UTF-8?q?correction=20peut-=C3=AAtre=20derni=C3=A8re?= =?UTF-8?q?=20erreur?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- medkit/text/deid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medkit/text/deid.py 
b/medkit/text/deid.py index 3ebb4d9b..89ce5993 100644 --- a/medkit/text/deid.py +++ b/medkit/text/deid.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, ClassVar, Iterator, Optional +from typing import TYPE_CHECKING, ClassVar, Iterator from presidio_analyzer import Pattern, PatternRecognizer From 2c355275233f8660a36a60cd00a05064e1fbd94a Mon Sep 17 00:00:00 2001 From: cpetresc Date: Thu, 1 Aug 2024 17:47:15 +0200 Subject: [PATCH 3/4] Update medkit/text/deid.py Co-authored-by: Ghislain Vaillant --- medkit/text/deid.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/medkit/text/deid.py b/medkit/text/deid.py index 89ce5993..fc3769a7 100644 --- a/medkit/text/deid.py +++ b/medkit/text/deid.py @@ -41,11 +41,16 @@ def _run_one(self, segment: Segment) -> Iterator[Entity]: class FrDateRecognizer(PatternRecognizer): """ - Recognizes French Date using regex. - - :param patterns: List of patterns to be used by this recognizer - :param supported_language: Language this recognizer supports - :param supported_entity: The entity this recognizer can detect + Recognizer for French dates. 
+ + Parameters + ---------- + patterns : list of Pattern, optional + User-defined patterns to be used by the recognizer + supported_language : str, default='fr' + Supported language declared by the recognizer + supported_entity : str, default='FR_DATE' + Supported entity detected by the recognizer """ PATTERNS: ClassVar[list[Pattern]] = [ From 5e753f18e96ccff68f7a542ec81542028bd0c384 Mon Sep 17 00:00:00 2001 From: Clara Petrescu-Stompor Date: Fri, 2 Aug 2024 10:40:44 +0200 Subject: [PATCH 4/4] change to docstring format in medkit --- medkit/text/deid.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/medkit/text/deid.py b/medkit/text/deid.py index fc3769a7..ad5ac51a 100644 --- a/medkit/text/deid.py +++ b/medkit/text/deid.py @@ -106,11 +106,16 @@ def __init__( class FrAgeRecognizer(PatternRecognizer): """ - Recognizes French Phone Age using regex. + Recognizer for French age. - :param patterns: List of patterns to be used by this recognizer - :param supported_language: Language this recognizer supports - :param supported_entity: The entity this recognizer can detect + Parameters + ---------- + patterns : list of Pattern, optional + User-defined patterns to be used by the recognizer + supported_language : str, default='fr' + Supported language declared by the recognizer + supported_entity : str, default='FR_AGE' + Supported entity detected by the recognizer """ PATTERNS: ClassVar[list[Pattern]] = [