diff --git a/app/error_messages.py b/app/error_messages.py index 188a9947d..de038f26e 100644 --- a/app/error_messages.py +++ b/app/error_messages.py @@ -1,6 +1,9 @@ """Error messages used in validators and exceptions throughout the validator codebase.""" DUMB_QUOTES_FOUND = "Found dumb quotes(s) in schema text" +INVALID_HTML_FOUND = "Found invalid HTML tag(s) in schema text" +INVALID_HTML_ENTITIES_FOUND = "Found invalid HTML entity(ies) in schema text" +SPACE_BEFORE_BR = "Found whitespace before
tag" INVALID_WHITESPACE_FOUND = "Found invalid white space(s) in schema text" DUPLICATE_ID_FOUND = "Duplicate id found" FOR_LIST_NEVER_POPULATED = "for_list is not populated by any ListCollector blocks or supplementary data sources" diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 27626c94c..bb1ea799d 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -5,6 +5,7 @@ QuestionnaireValidator """ +import html.entities import re from collections.abc import Mapping @@ -41,6 +42,12 @@ class QuestionnaireValidator(Validator): validate_smart_quotes validate_white_spaces validate_introduction_block + validate_html + validate_html_tags + validate_html_entities + is_valid_html_entity + validate_whitespace_before_br_tag + validate_p_tag_position validate_answer_references validate_list_references resolve_source_block_id @@ -70,6 +77,7 @@ def validate(self): self.validate_duplicates() self.validate_smart_quotes() self.validate_white_spaces() + self.validate_html() self.validate_answer_references() self.validate_list_references() @@ -167,6 +175,161 @@ def validate_smart_quotes(self): pointer=translatable_item.pointer, ) + def validate_html(self): + """Validates HTML in translatable schema text. + Checks tags, entities,
<br> whitespace, and <p>
positioning. + """ + for translatable_item in get_translatable_items(self.schema_element): # type: ignore + schema_text = translatable_item.value + values_to_check = [schema_text] + + if isinstance(schema_text, dict): + values_to_check = schema_text.values() + + for text in values_to_check: + if not isinstance(text, str) or not text: + continue + + if "<" in text and ">" in text: + self.validate_html_tags(text, translatable_item.pointer) + self.validate_whitespace_before_br_tag(text, translatable_item.pointer) + self.validate_p_tag_position(text, translatable_item.pointer) + + if "&" in text and ";" in text: + self.validate_html_entities(text, translatable_item.pointer) + + def validate_html_tags(self, text, pointer): + """Validates HTML tags. + + Args: + text (str): The text to be validated for HTML tags. + pointer (str): The JSON pointer indicating the location of the text in the questionnaire schema, used for + error reporting. + """ + allowed_tags = {"p", "strong", "a", "b", "br", "img"} + self_closing_tags = {"br", "img"} + + tag_matches = re.finditer(r"]*>", text) + stack = [] + + for match in tag_matches: + raw_tag = match.group(0) + tag_name = match.group(1).lower() + + is_closing = raw_tag.startswith("") or tag_name in self_closing_tags + + if tag_name not in allowed_tags: + self.add_error( + error_messages.INVALID_HTML_FOUND, + pointer=pointer, + text=text, + ) + return + + if is_closing: + if tag_name in self_closing_tags or not stack or stack[-1] != tag_name: + self.add_error( + error_messages.INVALID_HTML_FOUND, + pointer=pointer, + text=text, + ) + return + + stack.pop() + + elif not is_self_closing: + stack.append(tag_name) + + if stack: + self.add_error( + error_messages.INVALID_HTML_FOUND, + pointer=pointer, + text=text, + ) + + def is_valid_html_entity(self, entity): + """Checks whether a given HTML entity is valid. + Supports both numeric (decimal and hexadecimal) and named entities. + + Args: + entity (str): The HTML entity to validate (e.g. 
"&", "©"). + """ + if entity.startswith("&#") and entity.endswith(";"): + numeric = entity[2:-1] + + is_hex = numeric.lower().startswith("x") + + try: + numeric_value = int(numeric[1:], 16) if is_hex else int(numeric) + except ValueError: + return False + + return 0 <= numeric_value <= 0x10FFFF + + if entity.startswith("&") and entity.endswith(";"): + return entity[1:] in html.entities.html5 + + return False + + def validate_html_entities(self, text, pointer): + """Validates HTML entities found in the text. + + Extracts all entities and checks whether each one is valid. + + Args: + text (str): The text to validate for HTML entities. + pointer (str): JSON pointer to the location of the text in the schema. + """ + entity_matches = re.findall(r"&[^;\s]+;", text) + + for entity in entity_matches: + if not self.is_valid_html_entity(entity): + self.add_error( + error_messages.INVALID_HTML_ENTITIES_FOUND, + pointer=pointer, + text=text, + ) + return + + def validate_whitespace_before_br_tag(self, text, pointer): + """Checks for invalid whitespace before
tags. + + Args: + text (str): The text to validate. + pointer (str): JSON pointer to the location of the text in the schema. + """ + if re.search(r"\s+", text): + self.add_error( + error_messages.SPACE_BEFORE_BR, + pointer=pointer, + text=text, + ) + + def validate_p_tag_position(self, text, pointer): + """Checks if p tag is at the start of a sentence + (ignoring whitespace and wrapper characters like [] and ()). + + Args: + text (str): The text to validate. + pointer (str): JSON pointer to the location of the text in the schema. + """ + match = re.search(r"])[^>]*>", text) + + if not match: + return + + text_before_p_tag = text[: match.start()] + + content_before_p_tag = text_before_p_tag.strip(" \t\n\r[]()") + + if content_before_p_tag: + self.add_error( + error_messages.INVALID_HTML_FOUND, + pointer=pointer, + text=text, + ) + def validate_white_spaces(self): """Validate that there are no leading, trailing or multiple consecutive white spaces in the translatable text of the questionnaire schema. diff --git a/tests/schemas/invalid/test_invalid_html_tags.json b/tests/schemas/invalid/test_invalid_html_tags.json new file mode 100644 index 000000000..a6255f06d --- /dev/null +++ b/tests/schemas/invalid/test_invalid_html_tags.json @@ -0,0 +1,133 @@ +{ + "mime_type": "application/json/ons/eq", + "language": "en", + "schema_version": "0.0.1", + "data_version": "0.0.3", + "survey_id": "144", + "theme": "default", + "title": "Test invalid html", + "legal_basis": "Notice is given under section 999 of the Test Act 2000", + "metadata": [ + { + "name": "user_id", + "type": "string" + }, + { + "name": "period_id", + "type": "string" + }, + { + "name": "ru_name", + "type": "string" + }, + { + "name": "ru_ref", + "type": "string" + }, + { + "name": "trad_as", + "type": "string", + "optional": true + } + ], + "questionnaire_flow": { + "type": "Linear", + "options": {} + }, + "sections": [ + { + "id": "introduction-section", + "title": "Introduction
", + "groups": [ + { + "id": "introduction-group", + "title": "

General Business Information", + "blocks": [ + { + "id": "introduction", + "type": "Introduction", + "primary_content": [ + { + "id": "business-details", + "title": "Introduction with &fake; valid and invalid HTML", + "contents": [ + { + "guidance": { + "contents": [ + { + "title": "Coronavirus (COVID-19) guidance", + "description": "Explain your figures in the comment section to minimise us & contacting you and to help us tell an industry story" + } + ] + } + } + ] + } + ] + }, + { + "type": "Interstitial", + "id": "intersitital-one", + "content": { + "title": "Page

with invalid html

in title", + "contents": [ + { + "description": "[

You have successfully completed this section

]" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-two", + "content": { + "title": "Page with link", + "contents": [ + { + "description": "Anchor" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-three", + "content": { + "title": "Page with mixed invalid tags", + "contents": [ + { + "description": "

Title

Not valid tag" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-four", + "content": { + "title": "Valid double strong with another strong.", + "contents": [ + { + "description": "TitleNot valid tag" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-five", + "content": { + "title": "Valid double anchor.", + "contents": [ + { + "description": "Title and valid tag" + } + ] + } + } + ] + } + ] + } + ] +} diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index 689ca3afa..a42fd7034 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -334,6 +334,52 @@ def test_invalid_whitespaces_in_schema(): assert validator.errors == expected_error_messages +def test_invalid_html_in_schema(): + filename = "schemas/invalid/test_invalid_html_tags.json" + validator = QuestionnaireValidator(_open_and_load_schema_file(filename)) + + expected_error_messages = [ + { + "message": error_messages.SPACE_BEFORE_BR, + "pointer": "/sections/0/title", + "text": "Introduction
", + }, + { + "message": error_messages.INVALID_HTML_FOUND, + "pointer": "/sections/0/groups/0/title", + "text": "

General Business Information", + }, + { + "message": error_messages.INVALID_HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/1/content/title", + "text": "Page

with invalid html

in title", + }, + { + "message": error_messages.INVALID_HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/3/content/contents/0/description", + "text": "

Title

Not valid tag", + }, + { + "message": error_messages.INVALID_HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/4/content/contents/0/description", + "text": "TitleNot valid tag", + }, + { + "message": error_messages.INVALID_HTML_ENTITIES_FOUND, + "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/title", + "text": "Introduction with &fake; valid and invalid HTML", + }, + { + "message": error_messages.INVALID_HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/contents/0/guidance/contents/0/title", + "text": "Coronavirus (COVID-19) guidance", + }, + ] + validator.validate_html() + + assert validator.errors == expected_error_messages + + def test_invalid_answer_type_for_question_summary_concatenation(): filename = "schemas/invalid/test_invalid_answer_type_for_question_summary.json"