From e7a0c31aab9fa2b2701b822e1bd2123237737ce8 Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 15 Apr 2026 14:32:08 +0100 Subject: [PATCH 01/25] stack based html tag validation --- app/validators/questionnaire_validator.py | 83 +++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 27626c94..05152808 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -6,6 +6,7 @@ """ import re +from eq_translations.survey_schema import SurveySchema from collections.abc import Mapping from app import error_messages @@ -70,6 +71,7 @@ def validate(self): self.validate_duplicates() self.validate_smart_quotes() self.validate_white_spaces() + self.validate_html() self.validate_answer_references() self.validate_list_references() @@ -166,6 +168,87 @@ def validate_smart_quotes(self): error_messages.DUMB_QUOTES_FOUND, pointer=translatable_item.pointer, ) + def validate_html(self): + # loop over translatable strings + # call check_html_tags(text, pointer) + + schema_object = SurveySchema(self.schema_element) + + for translatable_item in schema_object.translatable_items: + schema_text = translatable_item.value + values_to_check = [schema_text] + + if isinstance(schema_text, dict): + values_to_check = schema_text.values() + + for text in values_to_check: + if isinstance(text, str) and text and "<" in text and ">" in text: + self.check_html_tags(text, translatable_item.pointer) + + return + + def check_html_tags(self, text, pointer): + """Checks valid html tags. + + Args: + text (str): The text to be validated for HTML tags. + pointer (str): The JSON pointer indicating the location of the text in the questionnaire schema, used for + error reporting. + """ + + allowed_tags = {"p", "strong", "a", "br", "em", "h1"} + self_closing_tags = {"br"} + + tag_matches = re.finditer(r"]*>", text) + stack = [] + + for match in tag_matches: #for each HTML tag found in the text + raw_tag = match.group(0) + tag_name = match.group(1).lower() + + is_closing = raw_tag.startswith("") or tag_name in self_closing_tags + + if tag_name not in allowed_tags: # invalid html tag found + self.add_error( + error_messages.HTML_FOUND, + pointer=pointer, + text=text, + ) + return + + if is_closing: #closed tag found, pop + if tag_name in self_closing_tags or not stack or stack[-1] != tag_name: + self.add_error( + error_messages.HTML_FOUND, + pointer=pointer, + text=text, + ) + return + + stack.pop() + + elif not is_self_closing:#open tag not void elem + stack.append(tag_name) + + if stack: + self.add_error( + error_messages.HTML_FOUND, + pointer=pointer, + text=text, + ) + + def validate_html_entities(self, text, pointer): + """Validate that there are no HTML entities in the translatable text fields of the questionnaire schema. Uses a + regular expression to search for occurrences of HTML entities in the text. + + Args: + text (str): The text to be validated for HTML entities. + pointer (str): The JSON pointer indicating the location of the text in the questionnaire schema, used for + error reporting. + """ + return + def validate_white_spaces(self): """Validate that there are no leading, trailing or multiple consecutive white spaces in the translatable text From af6fd1335e3f6fcafc351ef232d547344a24e5ca Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 15 Apr 2026 14:32:22 +0100 Subject: [PATCH 02/25] error msg --- app/error_messages.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/error_messages.py b/app/error_messages.py index 188a9947..1dd79854 100644 --- a/app/error_messages.py +++ b/app/error_messages.py @@ -1,6 +1,8 @@ """Error messages used in validators and exceptions throughout the validator codebase.""" DUMB_QUOTES_FOUND = "Found dumb quotes(s) in schema text" +HTML_FOUND = "Found invalid HTML tag(s) in schema text" +HTML_ENTITIES_FOUND = "Found invalid HTML entity(ies) in schema text" INVALID_WHITESPACE_FOUND = "Found invalid white space(s) in schema text" DUPLICATE_ID_FOUND = "Duplicate id found" FOR_LIST_NEVER_POPULATED = "for_list is not populated by any ListCollector blocks or supplementary data sources" From a00a83dc678d2eb2dbe771825e086a0c52a1b166 Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 15 Apr 2026 14:32:49 +0100 Subject: [PATCH 03/25] invalid html tag test --- .../invalid/test_invalid_html_tags.json | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 tests/schemas/invalid/test_invalid_html_tags.json diff --git a/tests/schemas/invalid/test_invalid_html_tags.json b/tests/schemas/invalid/test_invalid_html_tags.json new file mode 100644 index 00000000..ac03a8fc --- /dev/null +++ b/tests/schemas/invalid/test_invalid_html_tags.json @@ -0,0 +1,133 @@ +{ + "mime_type": "application/json/ons/eq", + "language": "en", + "schema_version": "0.0.1", + "data_version": "0.0.3", + "survey_id": "144", + "theme": "default", + "title": "Test invalid html", + "legal_basis": "Notice is given under section 999 of the Test Act 2000", + "metadata": [ + { + "name": "user_id", + "type": "string" + }, + { + "name": "period_id", + "type": "string" + }, + { + "name": "ru_name", + "type": "string" + }, + { + "name": "ru_ref", + "type": "string" + }, + { + "name": "trad_as", + "type": "string", + "optional": true + } + ], + "questionnaire_flow": { + "type": "Linear", + "options": {} + }, + "sections": [ + { + "id": "introduction-section", + "title": "Introduction", + "groups": [ + { + "id": "introduction-group", + "title": "General Business Information", + "blocks": [ + { + "id": "introduction", + "type": "Introduction", + "primary_content": [ + { + "id": "business-details", + "title": "Introduction with valid and invalid HTML", + "contents": [ + { + "guidance": { + "contents": [ + { + "title": "Coronavirus (COVID-19) guidance", + "description": "Explain your figures in the comment section to minimise us contacting you and to help us tell an industry story" + } + ] + } + } + ] + } + ] + }, + { + "type": "Interstitial", + "id": "intersitital-one", + "content": { + "title": "Page with invalid html", + "contents": [ + { + "description": "

You have successfully completed this section

" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-two", + "content": { + "title": "Page with link", + "contents": [ + { + "description": "Anchor" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-three", + "content": { + "title": "Page with mixed invalid tags", + "contents": [ + { + "description": "

Title

Not valid tag" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-four", + "content": { + "title": "Valid double strong with another strong.", + "contents": [ + { + "description": "TitleNot valid tag" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-five", + "content": { + "title": "Valid double anchor.", + "contents": [ + { + "description": "Title and Not valid tag" + } + ] + } + } + ] + } + ] + } + ] +} \ No newline at end of file From aecdd82a98263d4b6a06bbfd8e517b4af50949ff Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 15 Apr 2026 14:33:47 +0100 Subject: [PATCH 04/25] test invalid tag --- tests/test_questionnaire_validator.py | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index 689ca3af..7aa9e2e2 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -333,6 +333,41 @@ def test_invalid_whitespaces_in_schema(): assert validator.errors == expected_error_messages +def test_invalid_html_in_schema(): + filename = "schemas/invalid/test_invalid_html_tags.json" + validator = QuestionnaireValidator(_open_and_load_schema_file(filename)) + + expected_error_messages = [ + # { + # "message": error_messages.HTML_FOUND, + # "pointer": "/sections/0/groups/0/blocks/3/content/title", + # "text": "Page with mixed invalid tags", + # }, + # { + # "message": error_messages.HTML_FOUND, + # "pointer": "/sections/0/groups/0/blocks/1/content/contents/0/description", + # "text": "

You have successfully completed this section

", + # }, + # { + # "message": error_messages.HTML_FOUND, + # "pointer": "/sections/0/groups/0/blocks/3/content/contents/0/description", + # "text": "

Title

Not valid tag", + # }, + # { + # "message": error_messages.HTML_FOUND, + # "pointer": "/sections/0/groups/0/blocks/4/content/contents/0/description", + # "text": "TitleNot valid tag", + # }, + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/contents/0/guidance/contents/0/title", + "text": "Coronavirus (COVID-19) guidance", + }, + ] + validator.validate_html() + + assert validator.errors == expected_error_messages + def test_invalid_answer_type_for_question_summary_concatenation(): filename = "schemas/invalid/test_invalid_answer_type_for_question_summary.json" From 13d2916dd6a84f71f02fa3a68c6a2099598e60b7 Mon Sep 17 00:00:00 2001 From: oms09 Date: Mon, 20 Apr 2026 12:00:23 +0100 Subject: [PATCH 05/25] validate html entities --- app/validators/questionnaire_validator.py | 52 +++++++++++++++++------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 05152808..db81d1fb 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -4,7 +4,7 @@ Classes: QuestionnaireValidator """ - +import html.entities import re from eq_translations.survey_schema import SurveySchema from collections.abc import Mapping @@ -182,9 +182,15 @@ def validate_html(self): values_to_check = schema_text.values() for text in values_to_check: - if isinstance(text, str) and text and "<" in text and ">" in text: + if not isinstance(text, str) or not text: + continue + + if "<" in text and ">" in text: self.check_html_tags(text, translatable_item.pointer) + if "&" in text and ";" in text: + self.check_html_entities(text, translatable_item.pointer) + return def check_html_tags(self, text, pointer): @@ -200,7 +206,7 @@ def check_html_tags(self, text, pointer): self_closing_tags = {"br"} tag_matches = re.finditer(r"]*>", text) - stack = [] + stack = [] for match in tag_matches: #for each HTML tag found in the text raw_tag = match.group(0) @@ -238,17 +244,39 @@ def check_html_tags(self, text, pointer): text=text, ) - def validate_html_entities(self, text, pointer): - """Validate that there are no HTML entities in the translatable text fields of the questionnaire schema. Uses a - regular expression to search for occurrences of HTML entities in the text. + + def is_valid_html_entity(self, entity): + # Numeric entity + if entity.startswith("&#") and entity.endswith(";"): + numeric = entity[2:-1] - Args: - text (str): The text to be validated for HTML entities. - pointer (str): The JSON pointer indicating the location of the text in the questionnaire schema, used for - error reporting. - """ - return + try: + if numeric.lower().startswith("x"): + codepoint = int(numeric[1:], 16) + else: + codepoint = int(numeric) + except ValueError: + return False + + return 0 <= codepoint <= 0x10FFFF + + # Named entity + if entity.startswith("&") and entity.endswith(";"): + return entity[1:-1] in html.entities.html5 + + return False + + def check_html_entities(self, text, pointer): + entity_matches = re.findall(r"&[^;\s]+;", text) + for entity in entity_matches: + if not self.is_valid_html_entity(entity): + self.add_error( + error_messages.HTML_ENTITIES_FOUND, + pointer=pointer, + text=text, + ) + return def validate_white_spaces(self): """Validate that there are no leading, trailing or multiple consecutive white spaces in the translatable text From b06f9b43109e9f192e81dee2577a2ecea554203c Mon Sep 17 00:00:00 2001 From: oms09 Date: Mon, 20 Apr 2026 12:01:25 +0100 Subject: [PATCH 06/25] html entity test --- .../invalid/test_invalid_html_tags.json | 6 +++--- tests/test_questionnaire_validator.py | 21 ++++++++++--------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/schemas/invalid/test_invalid_html_tags.json b/tests/schemas/invalid/test_invalid_html_tags.json index ac03a8fc..560e1168 100644 --- a/tests/schemas/invalid/test_invalid_html_tags.json +++ b/tests/schemas/invalid/test_invalid_html_tags.json @@ -41,7 +41,7 @@ "groups": [ { "id": "introduction-group", - "title": "General Business Information", + "title": "

General Business Information", "blocks": [ { "id": "introduction", @@ -49,14 +49,14 @@ "primary_content": [ { "id": "business-details", - "title": "Introduction with valid and invalid HTML", + "title": "Introduction with &fake; valid and invalid HTML", "contents": [ { "guidance": { "contents": [ { "title": "Coronavirus (COVID-19) guidance", - "description": "Explain your figures in the comment section to minimise us contacting you and to help us tell an industry story" + "description": "Explain your figures in the comment section to minimise us & contacting you and to help us tell an industry story" } ] } diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index 7aa9e2e2..47d50c44 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -348,21 +348,22 @@ def test_invalid_html_in_schema(): # "pointer": "/sections/0/groups/0/blocks/1/content/contents/0/description", # "text": "

You have successfully completed this section

", # }, - # { - # "message": error_messages.HTML_FOUND, - # "pointer": "/sections/0/groups/0/blocks/3/content/contents/0/description", - # "text": "

Title

Not valid tag", - # }, - # { - # "message": error_messages.HTML_FOUND, - # "pointer": "/sections/0/groups/0/blocks/4/content/contents/0/description", - # "text": "TitleNot valid tag", - # }, + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/title", + "text": "

General Business Information", + }, + { + "message": error_messages.HTML_ENTITIES_FOUND, + "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/title", + "text": "Introduction with &fake; valid and invalid HTML", + }, { "message": error_messages.HTML_FOUND, "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/contents/0/guidance/contents/0/title", "text": "Coronavirus (COVID-19) guidance", }, + ] validator.validate_html() From 3a9926716c6254aa78212af14957e99b72bde77c Mon Sep 17 00:00:00 2001 From: oms09 Date: Mon, 20 Apr 2026 12:13:00 +0100 Subject: [PATCH 07/25] update allowed tags --- app/validators/questionnaire_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index db81d1fb..b943f12f 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -202,7 +202,7 @@ def check_html_tags(self, text, pointer): error reporting. """ - allowed_tags = {"p", "strong", "a", "br", "em", "h1"} + allowed_tags = {"p", "strong", "a"} self_closing_tags = {"br"} tag_matches = re.finditer(r"]*>", text) From 6aa9e962a2824cbe36bd587a9d9c9249d0845fa3 Mon Sep 17 00:00:00 2001 From: oms09 Date: Tue, 21 Apr 2026 18:49:16 +0100 Subject: [PATCH 08/25] updated allowed tags --- app/validators/questionnaire_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index b943f12f..cb54e0d5 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -202,7 +202,7 @@ def check_html_tags(self, text, pointer): error reporting. """ - allowed_tags = {"p", "strong", "a"} + allowed_tags = {"p", "strong", "a", "b"} self_closing_tags = {"br"} tag_matches = re.finditer(r"]*>", text) From e1413b8f73d24a5a802cfb9f491c01a5ffbbed7e Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 12:26:53 +0100 Subject: [PATCH 09/25] update slice removing last char --- app/validators/questionnaire_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index cb54e0d5..583e2159 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -262,7 +262,7 @@ def is_valid_html_entity(self, entity): # Named entity if entity.startswith("&") and entity.endswith(";"): - return entity[1:-1] in html.entities.html5 + return entity[1:] in html.entities.html5 return False From b1b7b7e49c08268a7d551ab4952845ca49953385 Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 13:10:42 +0100 Subject: [PATCH 10/25] update unit test --- tests/test_questionnaire_validator.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index 47d50c44..c9f79f18 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -338,21 +338,21 @@ def test_invalid_html_in_schema(): validator = QuestionnaireValidator(_open_and_load_schema_file(filename)) expected_error_messages = [ - # { - # "message": error_messages.HTML_FOUND, - # "pointer": "/sections/0/groups/0/blocks/3/content/title", - # "text": "Page with mixed invalid tags", - # }, - # { - # "message": error_messages.HTML_FOUND, - # "pointer": "/sections/0/groups/0/blocks/1/content/contents/0/description", - # "text": "

You have successfully completed this section

", - # }, { "message": error_messages.HTML_FOUND, "pointer": "/sections/0/groups/0/title", "text": "

General Business Information", }, + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/3/content/contents/0/description", + "text": "

Title

Not valid tag", + }, + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/4/content/contents/0/description", + "text": "TitleNot valid tag", + }, { "message": error_messages.HTML_ENTITIES_FOUND, "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/title", @@ -363,7 +363,6 @@ def test_invalid_html_in_schema(): "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/contents/0/guidance/contents/0/title", "text": "Coronavirus (COVID-19) guidance", }, - ] validator.validate_html() From cc3dedd9a63d619c2469b2c94f832e523a89291b Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 14:01:56 +0100 Subject: [PATCH 11/25] space before br tag --- app/error_messages.py | 1 + tests/schemas/invalid/test_invalid_html_tags.json | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/app/error_messages.py b/app/error_messages.py index 1dd79854..d406c937 100644 --- a/app/error_messages.py +++ b/app/error_messages.py @@ -3,6 +3,7 @@ DUMB_QUOTES_FOUND = "Found dumb quotes(s) in schema text" HTML_FOUND = "Found invalid HTML tag(s) in schema text" HTML_ENTITIES_FOUND = "Found invalid HTML entity(ies) in schema text" +SPACE_BEFORE_BR = "Found whitespace before
tag" INVALID_WHITESPACE_FOUND = "Found invalid white space(s) in schema text" DUPLICATE_ID_FOUND = "Duplicate id found" FOR_LIST_NEVER_POPULATED = "for_list is not populated by any ListCollector blocks or supplementary data sources" diff --git a/tests/schemas/invalid/test_invalid_html_tags.json b/tests/schemas/invalid/test_invalid_html_tags.json index 560e1168..233b33f9 100644 --- a/tests/schemas/invalid/test_invalid_html_tags.json +++ b/tests/schemas/invalid/test_invalid_html_tags.json @@ -37,7 +37,7 @@ "sections": [ { "id": "introduction-section", - "title": "Introduction", + "title": "Introduction
", "groups": [ { "id": "introduction-group", From adac41b3d63d3c3734fe1112b67b067fd0f3575a Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 14:02:30 +0100 Subject: [PATCH 12/25] update tags & check br whitespace --- app/validators/questionnaire_validator.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 583e2159..cad34be9 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -187,6 +187,7 @@ def validate_html(self): if "<" in text and ">" in text: self.check_html_tags(text, translatable_item.pointer) + self.check_br_tag_whitespace(text, translatable_item.pointer) if "&" in text and ";" in text: self.check_html_entities(text, translatable_item.pointer) @@ -202,8 +203,8 @@ def check_html_tags(self, text, pointer): error reporting. """ - allowed_tags = {"p", "strong", "a", "b"} - self_closing_tags = {"br"} + allowed_tags = {"p", "strong", "a", "b", "br", "img"} + self_closing_tags = {"br", "img"} tag_matches = re.finditer(r"]*>", text) stack = [] @@ -277,6 +278,15 @@ def check_html_entities(self, text, pointer): text=text, ) return + + def check_br_tag_whitespace(self, text, pointer): + if re.search(r"\s+", text): + self.add_error( + error_messages.SPACE_BEFORE_BR, + pointer=pointer, + text=text, + ) + def validate_white_spaces(self): """Validate that there are no leading, trailing or multiple consecutive white spaces in the translatable text From 72389243056631874322db322c6bab99e2e9afec Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 14:02:57 +0100 Subject: [PATCH 13/25] unit test for whitespace before
--- tests/test_questionnaire_validator.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index c9f79f18..7c4e98d9 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -338,6 +338,11 @@ def test_invalid_html_in_schema(): validator = QuestionnaireValidator(_open_and_load_schema_file(filename)) expected_error_messages = [ + { + "message": error_messages.SPACE_BEFORE_BR, + "pointer": "/sections/0/title", + "text": "Introduction
", + }, { "message": error_messages.HTML_FOUND, "pointer": "/sections/0/groups/0/title", From e869b2e713e9e33977f231090ed536b2c5971273 Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 14:14:30 +0100 Subject: [PATCH 14/25] lint & refactor --- app/validators/questionnaire_validator.py | 43 ++++++++----------- .../invalid/test_invalid_html_tags.json | 2 +- tests/test_questionnaire_validator.py | 1 + 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index cad34be9..338da9f1 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -4,11 +4,13 @@ Classes: QuestionnaireValidator """ + import html.entities import re -from eq_translations.survey_schema import SurveySchema from collections.abc import Mapping +from eq_translations.survey_schema import SurveySchema + from app import error_messages from app.validators.answer_code_validator import AnswerCodeValidator from app.validators.metadata_validator import MetadataValidator @@ -168,6 +170,7 @@ def validate_smart_quotes(self): error_messages.DUMB_QUOTES_FOUND, pointer=translatable_item.pointer, ) + def validate_html(self): # loop over translatable strings # call check_html_tags(text, pointer) @@ -190,9 +193,7 @@ def validate_html(self): self.check_br_tag_whitespace(text, translatable_item.pointer) if "&" in text and ";" in text: - self.check_html_entities(text, translatable_item.pointer) - - return + self.check_html_entities(text, translatable_item.pointer) def check_html_tags(self, text, pointer): """Checks valid html tags. @@ -202,29 +203,28 @@ def check_html_tags(self, text, pointer): pointer (str): The JSON pointer indicating the location of the text in the questionnaire schema, used for error reporting. """ - allowed_tags = {"p", "strong", "a", "b", "br", "img"} self_closing_tags = {"br", "img"} - + tag_matches = re.finditer(r"]*>", text) - stack = [] - - for match in tag_matches: #for each HTML tag found in the text + stack = [] + + for match in tag_matches: raw_tag = match.group(0) tag_name = match.group(1).lower() is_closing = raw_tag.startswith("") or tag_name in self_closing_tags - if tag_name not in allowed_tags: # invalid html tag found + if tag_name not in allowed_tags: self.add_error( error_messages.HTML_FOUND, pointer=pointer, text=text, ) return - - if is_closing: #closed tag found, pop + + if is_closing: if tag_name in self_closing_tags or not stack or stack[-1] != tag_name: self.add_error( error_messages.HTML_FOUND, @@ -235,7 +235,7 @@ def check_html_tags(self, text, pointer): stack.pop() - elif not is_self_closing:#open tag not void elem + elif not is_self_closing: stack.append(tag_name) if stack: @@ -245,28 +245,22 @@ def check_html_tags(self, text, pointer): text=text, ) - def is_valid_html_entity(self, entity): - # Numeric entity if entity.startswith("&#") and entity.endswith(";"): numeric = entity[2:-1] try: - if numeric.lower().startswith("x"): - codepoint = int(numeric[1:], 16) - else: - codepoint = int(numeric) + numeric_value = int(numeric[1:], 16) if numeric.lower().startswith("x") else int(numeric) except ValueError: return False - return 0 <= codepoint <= 0x10FFFF + return 0 <= numeric_value <= 0x10FFFF - # Named entity if entity.startswith("&") and entity.endswith(";"): return entity[1:] in html.entities.html5 return False - + def check_html_entities(self, text, pointer): entity_matches = re.findall(r"&[^;\s]+;", text) @@ -277,8 +271,8 @@ def check_html_entities(self, text, pointer): pointer=pointer, text=text, ) - return - + return + def check_br_tag_whitespace(self, text, pointer): if re.search(r"\s+", text): self.add_error( @@ -287,7 +281,6 @@ def check_br_tag_whitespace(self, text, pointer): text=text, ) - def validate_white_spaces(self): """Validate that there are no leading, trailing or multiple consecutive white spaces in the translatable text of the questionnaire schema. diff --git a/tests/schemas/invalid/test_invalid_html_tags.json b/tests/schemas/invalid/test_invalid_html_tags.json index 233b33f9..3ce6e600 100644 --- a/tests/schemas/invalid/test_invalid_html_tags.json +++ b/tests/schemas/invalid/test_invalid_html_tags.json @@ -130,4 +130,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index 7c4e98d9..699de884 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -333,6 +333,7 @@ def test_invalid_whitespaces_in_schema(): assert validator.errors == expected_error_messages + def test_invalid_html_in_schema(): filename = "schemas/invalid/test_invalid_html_tags.json" validator = QuestionnaireValidator(_open_and_load_schema_file(filename)) From 6a26f9b3859791df98c8d02d9bc47c80c5a0e701 Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 14:19:28 +0100 Subject: [PATCH 15/25] update hex entity var name --- app/validators/questionnaire_validator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 338da9f1..4ad8bf38 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -249,8 +249,10 @@ def is_valid_html_entity(self, entity): if entity.startswith("&#") and entity.endswith(";"): numeric = entity[2:-1] + is_hex = numeric.lower().startswith("x") + try: - numeric_value = int(numeric[1:], 16) if numeric.lower().startswith("x") else int(numeric) + numeric_value = int(numeric[1:], 16) if is_hex else int(numeric) except ValueError: return False From 8fa1a27c1c5ebe96d1313ffdf5a643b7fa604a7f Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 14:54:20 +0100 Subject: [PATCH 16/25] eq_translation error updated get_translatable_items --- app/validators/questionnaire_validator.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 4ad8bf38..191fd429 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -9,8 +9,6 @@ import re from collections.abc import Mapping -from eq_translations.survey_schema import SurveySchema - from app import error_messages from app.validators.answer_code_validator import AnswerCodeValidator from app.validators.metadata_validator import MetadataValidator @@ -175,9 +173,8 @@ def validate_html(self): # loop over translatable strings # call check_html_tags(text, pointer) - schema_object = SurveySchema(self.schema_element) - for translatable_item in schema_object.translatable_items: + for translatable_item in get_translatable_items(self.schema_element): # type: ignore schema_text = translatable_item.value values_to_check = [schema_text] From d0e69de9e25ba0a34438851a6dfa838d6f3d7aef Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 15:29:23 +0100 Subject: [PATCH 17/25] check p tag position --- app/validators/questionnaire_validator.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 191fd429..b0f6349b 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -188,6 +188,7 @@ def validate_html(self): if "<" in text and ">" in text: self.check_html_tags(text, translatable_item.pointer) self.check_br_tag_whitespace(text, translatable_item.pointer) + self.check_p_tag_position(text, translatable_item.pointer) if "&" in text and ";" in text: self.check_html_entities(text, translatable_item.pointer) @@ -280,6 +281,22 @@ def check_br_tag_whitespace(self, text, pointer): text=text, ) + def check_p_tag_position(self, text, pointer): + match = re.search(r"])[^>]*>", text) + + if not match: + return + + text_before_p = text[:match.start()] + + content_before_p = text_before_p.strip(" \t\n\r[]()") + + if content_before_p: + self.add_error( + error_messages.HTML_FOUND, + pointer=pointer, + text=text, + ) def validate_white_spaces(self): """Validate that there are no leading, trailing or multiple consecutive white spaces in the translatable text of the questionnaire schema. From 6406ebea1f12c65c97d8adc01437e87e26a192b7 Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 15:29:49 +0100 Subject: [PATCH 18/25] p tag position test --- tests/schemas/invalid/test_invalid_html_tags.json | 4 ++-- tests/test_questionnaire_validator.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/schemas/invalid/test_invalid_html_tags.json b/tests/schemas/invalid/test_invalid_html_tags.json index 3ce6e600..d618088c 100644 --- a/tests/schemas/invalid/test_invalid_html_tags.json +++ b/tests/schemas/invalid/test_invalid_html_tags.json @@ -69,10 +69,10 @@ "type": "Interstitial", "id": "intersitital-one", "content": { - "title": "Page with invalid html", + "title": "Page

with invalid html

in title", "contents": [ { - "description": "

You have successfully completed this section

" + "description": "[

You have successfully completed this section

]" } ] } diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index 699de884..89a5e56c 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -349,6 +349,11 @@ def test_invalid_html_in_schema(): "pointer": "/sections/0/groups/0/title", "text": "

General Business Information", }, + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/1/content/title", + "text": "Page

with invalid html

in title", + }, { "message": error_messages.HTML_FOUND, "pointer": "/sections/0/groups/0/blocks/3/content/contents/0/description", From a31dbae15ec28cfef0f118d0e79fa0c58b561ee8 Mon Sep 17 00:00:00 2001 From: oms09 Date: Wed, 22 Apr 2026 17:18:01 +0100 Subject: [PATCH 19/25] docstring & lint --- app/validators/questionnaire_validator.py | 39 +++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index b0f6349b..25df72df 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -170,10 +170,9 @@ def validate_smart_quotes(self): ) def validate_html(self): - # loop over translatable strings - # call check_html_tags(text, pointer) - - + """Validates HTML in translatable schema text. + Checks tags, entities,
whitespace, and

positioning. + """ for translatable_item in get_translatable_items(self.schema_element): # type: ignore schema_text = translatable_item.value values_to_check = [schema_text] @@ -194,7 +193,7 @@ def validate_html(self): self.check_html_entities(text, translatable_item.pointer) def check_html_tags(self, text, pointer): - """Checks valid html tags. + """Validates HTML tags. Args: text (str): The text to be validated for HTML tags. @@ -244,6 +243,12 @@ def check_html_tags(self, text, pointer): ) def is_valid_html_entity(self, entity): + """Checks whether a given HTML entity is valid. + Supports both numeric (decimal and hexadecimal) and named entities. + + Args: + entity (str): The HTML entity to validate (e.g. "&", "©"). + """ if entity.startswith("&#") and entity.endswith(";"): numeric = entity[2:-1] @@ -262,6 +267,14 @@ def is_valid_html_entity(self, entity): return False def check_html_entities(self, text, pointer): + """Validates HTML entities found in the text. + + Extracts all entities and checks whether each one is valid. + + Args: + text (str): The text to validate for HTML entities. + pointer (str): JSON pointer to the location of the text in the schema. + """ entity_matches = re.findall(r"&[^;\s]+;", text) for entity in entity_matches: @@ -274,6 +287,12 @@ def check_html_entities(self, text, pointer): return def check_br_tag_whitespace(self, text, pointer): + """Checks for invalid whitespace before
tags. + + Args: + text (str): The text to validate. + pointer (str): JSON pointer to the location of the text in the schema. + """ if re.search(r"\s+", text): self.add_error( error_messages.SPACE_BEFORE_BR, @@ -282,12 +301,19 @@ def check_br_tag_whitespace(self, text, pointer): ) def check_p_tag_position(self, text, pointer): + """Checks if p tag is at the start of a sentence + (ignoring whitespace and wrapper characters like [] and ()). + + Args: + text (str): The text to validate. + pointer (str): JSON pointer to the location of the text in the schema. + """ match = re.search(r"])[^>]*>", text) if not match: return - text_before_p = text[:match.start()] + text_before_p = text[: match.start()] content_before_p = text_before_p.strip(" \t\n\r[]()") @@ -297,6 +323,7 @@ def check_p_tag_position(self, text, pointer): pointer=pointer, text=text, ) + def validate_white_spaces(self): """Validate that there are no leading, trailing or multiple consecutive white spaces in the translatable text of the questionnaire schema. From be693998eb4a088f9757977cee4880e9603c4bf2 Mon Sep 17 00:00:00 2001 From: oms09 Date: Thu, 7 May 2026 11:20:20 +0100 Subject: [PATCH 20/25] update docstring and error message name --- app/validators/questionnaire_validator.py | 11 ++++++----- tests/test_questionnaire_validator.py | 12 ++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 25df72df..0cf85674 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -42,6 +42,7 @@ class QuestionnaireValidator(Validator): validate_smart_quotes validate_white_spaces validate_introduction_block + validate_html validate_answer_references validate_list_references resolve_source_block_id @@ -215,7 +216,7 @@ def check_html_tags(self, text, pointer): if tag_name not in allowed_tags: self.add_error( - error_messages.HTML_FOUND, + error_messages.INVALID_HTML_FOUND, pointer=pointer, text=text, ) @@ -224,7 +225,7 @@ def check_html_tags(self, text, pointer): if is_closing: if tag_name in self_closing_tags or not stack or stack[-1] != tag_name: self.add_error( - error_messages.HTML_FOUND, + error_messages.INVALID_HTML_FOUND, pointer=pointer, text=text, ) @@ -237,7 +238,7 @@ def check_html_tags(self, text, pointer): if stack: self.add_error( - error_messages.HTML_FOUND, + error_messages.INVALID_HTML_FOUND, pointer=pointer, text=text, ) @@ -280,7 +281,7 @@ def check_html_entities(self, text, pointer): for entity in entity_matches: if not self.is_valid_html_entity(entity): self.add_error( - error_messages.HTML_ENTITIES_FOUND, + error_messages.INVALID_HTML_ENTITIES_FOUND, pointer=pointer, text=text, ) @@ -319,7 +320,7 @@ def check_p_tag_position(self, text, pointer): if content_before_p: self.add_error( - error_messages.HTML_FOUND, + error_messages.INVALID_HTML_FOUND, pointer=pointer, text=text, ) diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index 89a5e56c..a42fd703 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -345,32 +345,32 @@ def test_invalid_html_in_schema(): "text": "Introduction
", }, { - "message": error_messages.HTML_FOUND, + "message": error_messages.INVALID_HTML_FOUND, "pointer": "/sections/0/groups/0/title", "text": "

General Business Information", }, { - "message": error_messages.HTML_FOUND, + "message": error_messages.INVALID_HTML_FOUND, "pointer": "/sections/0/groups/0/blocks/1/content/title", "text": "Page

with invalid html

in title", }, { - "message": error_messages.HTML_FOUND, + "message": error_messages.INVALID_HTML_FOUND, "pointer": "/sections/0/groups/0/blocks/3/content/contents/0/description", "text": "

Title

Not valid tag", }, { - "message": error_messages.HTML_FOUND, + "message": error_messages.INVALID_HTML_FOUND, "pointer": "/sections/0/groups/0/blocks/4/content/contents/0/description", "text": "TitleNot valid tag", }, { - "message": error_messages.HTML_ENTITIES_FOUND, + "message": error_messages.INVALID_HTML_ENTITIES_FOUND, "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/title", "text": "Introduction with &fake; valid and invalid HTML", }, { - "message": error_messages.HTML_FOUND, + "message": error_messages.INVALID_HTML_FOUND, "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/contents/0/guidance/contents/0/title", "text": "Coronavirus (COVID-19) guidance", }, From 717470c9f84049319d00f6560a679c2a8b2418b1 Mon Sep 17 00:00:00 2001 From: oms09 Date: Thu, 7 May 2026 11:25:52 +0100 Subject: [PATCH 21/25] error msg update --- app/error_messages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/error_messages.py b/app/error_messages.py index d406c937..de038f26 100644 --- a/app/error_messages.py +++ b/app/error_messages.py @@ -1,8 +1,8 @@ """Error messages used in validators and exceptions throughout the validator codebase.""" DUMB_QUOTES_FOUND = "Found dumb quotes(s) in schema text" -HTML_FOUND = "Found invalid HTML tag(s) in schema text" -HTML_ENTITIES_FOUND = "Found invalid HTML entity(ies) in schema text" +INVALID_HTML_FOUND = "Found invalid HTML tag(s) in schema text" +INVALID_HTML_ENTITIES_FOUND = "Found invalid HTML entity(ies) in schema text" SPACE_BEFORE_BR = "Found whitespace before
tag" INVALID_WHITESPACE_FOUND = "Found invalid white space(s) in schema text" DUPLICATE_ID_FOUND = "Duplicate id found" From 7660153fc81ce345e0598c0f24b57c71bc56b9fa Mon Sep 17 00:00:00 2001 From: oms09 Date: Tue, 12 May 2026 11:58:55 +0100 Subject: [PATCH 22/25] update msg & indent --- app/validators/questionnaire_validator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 0cf85674..4bf1433a 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -306,19 +306,19 @@ def check_p_tag_position(self, text, pointer): (ignoring whitespace and wrapper characters like [] and ()). Args: - text (str): The text to validate. - pointer (str): JSON pointer to the location of the text in the schema. + text (str): The text to validate. + pointer (str): JSON pointer to the location of the text in the schema. """ match = re.search(r"])[^>]*>", text) if not match: return - text_before_p = text[: match.start()] + text_before_p_tag = text[: match.start()] - content_before_p = text_before_p.strip(" \t\n\r[]()") + content_before_p_tag = text_before_p_tag.strip(" \t\n\r[]()") - if content_before_p: + if content_before_p_tag: self.add_error( error_messages.INVALID_HTML_FOUND, pointer=pointer, From 6bdc4a50313fbb8279f08720fc5d41c4f6ec33c8 Mon Sep 17 00:00:00 2001 From: oms09 Date: Fri, 15 May 2026 12:02:57 +0100 Subject: [PATCH 23/25] docstring n name change --- app/validators/questionnaire_validator.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 4bf1433a..46cfd24f 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -43,6 +43,11 @@ class QuestionnaireValidator(Validator): validate_white_spaces validate_introduction_block validate_html + validate_html_tags + validate_html_entities + is_valid_html_entity + validate_br_tag_whitespace + validate_p_tag_position validate_answer_references validate_list_references resolve_source_block_id @@ -186,14 +191,14 @@ def validate_html(self): continue if "<" in text and ">" in text: - self.check_html_tags(text, translatable_item.pointer) - self.check_br_tag_whitespace(text, translatable_item.pointer) - self.check_p_tag_position(text, translatable_item.pointer) + self.validate_html_tags(text, translatable_item.pointer) + self.validate_br_tag_whitespace(text, translatable_item.pointer) + self.validate_p_tag_position(text, translatable_item.pointer) if "&" in text and ";" in text: - self.check_html_entities(text, translatable_item.pointer) + self.validate_html_entities(text, translatable_item.pointer) - def check_html_tags(self, text, pointer): + def validate_html_tags(self, text, pointer): """Validates HTML tags. Args: @@ -267,7 +272,7 @@ def is_valid_html_entity(self, entity): return False - def check_html_entities(self, text, pointer): + def validate_html_entities(self, text, pointer): """Validates HTML entities found in the text. Extracts all entities and checks whether each one is valid. @@ -287,7 +292,7 @@ def check_html_entities(self, text, pointer): ) return - def check_br_tag_whitespace(self, text, pointer): + def validate_br_tag_whitespace(self, text, pointer): """Checks for invalid whitespace before
tags. Args: @@ -301,7 +306,7 @@ def check_br_tag_whitespace(self, text, pointer): text=text, ) - def check_p_tag_position(self, text, pointer): + def validate_p_tag_position(self, text, pointer): """Checks if p tag is at the start of a sentence (ignoring whitespace and wrapper characters like [] and ()). From 54c277d5fd690c6cade7646a2d9f281acd8e7dee Mon Sep 17 00:00:00 2001 From: oms09 Date: Fri, 15 May 2026 12:14:16 +0100 Subject: [PATCH 24/25] update test text --- tests/schemas/invalid/test_invalid_html_tags.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/schemas/invalid/test_invalid_html_tags.json b/tests/schemas/invalid/test_invalid_html_tags.json index d618088c..a6255f06 100644 --- a/tests/schemas/invalid/test_invalid_html_tags.json +++ b/tests/schemas/invalid/test_invalid_html_tags.json @@ -120,7 +120,7 @@ "title": "Valid double anchor.", "contents": [ { - "description": "Title and Not valid tag" + "description": "Title and valid tag" } ] } From 6735d8b04fa0c76426d8baa01c9a00e3c9115780 Mon Sep 17 00:00:00 2001 From: oms09 Date: Fri, 15 May 2026 12:18:33 +0100 Subject: [PATCH 25/25] br tag whitespace name --- app/validators/questionnaire_validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 46cfd24f..bb1ea799 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -46,7 +46,7 @@ class QuestionnaireValidator(Validator): validate_html_tags validate_html_entities is_valid_html_entity - validate_br_tag_whitespace + validate_whitespace_before_br_tag validate_p_tag_position validate_answer_references validate_list_references @@ -192,7 +192,7 @@ def validate_html(self): if "<" in text and ">" in text: self.validate_html_tags(text, translatable_item.pointer) - self.validate_br_tag_whitespace(text, translatable_item.pointer) + self.validate_whitespace_before_br_tag(text, translatable_item.pointer) self.validate_p_tag_position(text, translatable_item.pointer) if "&" in text and ";" in text: @@ -292,7 +292,7 @@ def validate_html_entities(self, text, pointer): ) return - def validate_br_tag_whitespace(self, text, pointer): + def validate_whitespace_before_br_tag(self, text, pointer): """Checks for invalid whitespace before
tags. Args: