Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e7a0c31
stack based html tag validation
oms09 Apr 15, 2026
af6fd13
error msg
oms09 Apr 15, 2026
a00a83d
invalid html tag test
oms09 Apr 15, 2026
aecdd82
test invalid tag
oms09 Apr 15, 2026
13d2916
validate html entities
oms09 Apr 20, 2026
b06f9b4
html entity test
oms09 Apr 20, 2026
3a99267
update allowed tags
oms09 Apr 20, 2026
6aa9e96
updated allowed tags
oms09 Apr 21, 2026
e1413b8
update slice removing last char
oms09 Apr 22, 2026
b1b7b7e
update unit test
oms09 Apr 22, 2026
cc3dedd
space before br tag
oms09 Apr 22, 2026
adac41b
update tags & check br whitespace
oms09 Apr 22, 2026
7238924
unit test for whitespace before <br>
oms09 Apr 22, 2026
12efea7
Merge branch 'main' into eqs-766-add-html-validation
oms09 Apr 22, 2026
e869b2e
lint & refactor
oms09 Apr 22, 2026
b3e0b4c
Merge branch 'eqs-766-add-html-validation' of github.com:ONSdigital/e…
oms09 Apr 22, 2026
6a26f9b
update hex entity var name
oms09 Apr 22, 2026
8fa1a27
eq_translation error updated get_translatable_items
oms09 Apr 22, 2026
d0e69de
check p tag position
oms09 Apr 22, 2026
6406ebe
p tag position test
oms09 Apr 22, 2026
a31dbae
docstring & lint
oms09 Apr 22, 2026
5d0851b
Merge branch 'main' into eqs-766-add-html-validation
oms09 Apr 22, 2026
adbc7a4
Merge branch 'main' into eqs-766-add-html-validation
oms09 Apr 22, 2026
be69399
update docstring and error message name
oms09 May 7, 2026
717470c
error msg update
oms09 May 7, 2026
3396946
Merge branch 'main' into eqs-766-add-html-validation
oms09 May 7, 2026
8f52fe8
Merge branch 'main' into eqs-766-add-html-validation
oms09 May 7, 2026
40885e2
Merge branch 'main' into eqs-766-add-html-validation
oms09 May 8, 2026
7660153
update msg & indent
oms09 May 12, 2026
dadf743
Merge branch 'main' into eqs-766-add-html-validation
oms09 May 12, 2026
6bdc4a5
docstring n name change
oms09 May 15, 2026
4867524
Merge branch 'eqs-766-add-html-validation' of github.com:ONSdigital/e…
oms09 May 15, 2026
54c277d
update test text
oms09 May 15, 2026
6735d8b
br tag whitespace name
oms09 May 15, 2026
d928bfa
Merge branch 'main' into eqs-766-add-html-validation
oms09 May 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions app/error_messages.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""Error messages used in validators and exceptions throughout the validator codebase."""

# NOTE(review): "quotes(s)" looks like a typo for "quote(s)"; left as-is because the string is runtime text.
DUMB_QUOTES_FOUND = "Found dumb quotes(s) in schema text"
# Recorded by the HTML tag check (disallowed, mis-nested or unclosed tags) and by the <p> position check.
INVALID_HTML_FOUND = "Found invalid HTML tag(s) in schema text"
# Recorded when a "&...;" reference in schema text is not a recognised named or numeric entity.
INVALID_HTML_ENTITIES_FOUND = "Found invalid HTML entity(ies) in schema text"
# Recorded when whitespace directly precedes a <br> tag.
SPACE_BEFORE_BR = "Found whitespace before <br> tag"
INVALID_WHITESPACE_FOUND = "Found invalid white space(s) in schema text"
DUPLICATE_ID_FOUND = "Duplicate id found"
FOR_LIST_NEVER_POPULATED = "for_list is not populated by any ListCollector blocks or supplementary data sources"
Expand Down
163 changes: 163 additions & 0 deletions app/validators/questionnaire_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
QuestionnaireValidator
"""

import html.entities
import re
from collections.abc import Mapping

Expand Down Expand Up @@ -41,6 +42,12 @@ class QuestionnaireValidator(Validator):
validate_smart_quotes
validate_white_spaces
validate_introduction_block
validate_html
Comment thread
VirajP1002 marked this conversation as resolved.
validate_html_tags
validate_html_entities
is_valid_html_entity
validate_whitespace_before_br_tag
validate_p_tag_position
validate_answer_references
validate_list_references
resolve_source_block_id
Expand Down Expand Up @@ -70,6 +77,7 @@ def validate(self):
self.validate_duplicates()
self.validate_smart_quotes()
self.validate_white_spaces()
self.validate_html()
self.validate_answer_references()
self.validate_list_references()

Expand Down Expand Up @@ -167,6 +175,161 @@ def validate_smart_quotes(self):
pointer=translatable_item.pointer,
)

def validate_html(self):
    """Run the HTML checks over every translatable string in the schema.

    Each translatable item's value is either checked directly or, when it is a dict,
    each of its values is checked. Non-string and empty values are skipped.
    Tag, <br> whitespace and <p> position checks only run on text containing both
    "<" and ">"; the entity check only runs on text containing both "&" and ";".
    """
    for item in get_translatable_items(self.schema_element):  # type: ignore
        value = item.value
        candidates = value.values() if isinstance(value, dict) else [value]

        for candidate in candidates:
            # Only non-empty strings can contain markup worth inspecting.
            if not (isinstance(candidate, str) and candidate):
                continue

            if "<" in candidate and ">" in candidate:
                markup_checks = (
                    self.validate_html_tags,
                    self.validate_whitespace_before_br_tag,
                    self.validate_p_tag_position,
                )
                for check in markup_checks:
                    check(candidate, item.pointer)

            if "&" in candidate and ";" in candidate:
                self.validate_html_entities(candidate, item.pointer)

def validate_html_tags(self, text, pointer):
    """Validate that every HTML tag in the text is allowed and correctly nested.

    Tags are scanned left to right and open tags are tracked on a stack. At most one
    INVALID_HTML_FOUND error is recorded per text, when:
      * a tag is not one of the allowed tags,
      * a closing tag does not match the most recently opened tag,
      * a void tag (<br>, <img>) is given a closing tag, or
      * a tag is still open at the end of the text.

    Tag names are compared case-insensitively. A name must start with a letter, so text such
    as "x <5 and y >3" is not mistaken for markup, and the whole name up to the first
    whitespace, "/" or ">" is compared, so "<p-x>" is not accepted as "<p>". A trailing "/"
    only self-closes void tags; on any other tag it is ignored (as HTML parsers do), so
    "<p/>" still needs a matching "</p>".

    Args:
        text (str): The text to be validated for HTML tags.
        pointer (str): The JSON pointer indicating the location of the text in the questionnaire schema, used for
            error reporting.
    """
    allowed_tags = {"p", "strong", "a", "b", "br", "img"}
    # Void elements have no content and no closing tag, so they never go on the stack.
    void_tags = {"br", "img"}

    open_tags = []
    is_valid = True

    # group(1): "/" for a closing tag, "" otherwise; group(2): the complete tag name.
    for match in re.finditer(r"<(/?)([a-zA-Z][^\s/>]*)[^>]*>", text):
        closing_slash, raw_name = match.groups()
        tag_name = raw_name.lower()

        if tag_name not in allowed_tags:
            is_valid = False
            break

        if closing_slash:
            # Void tags are never pushed, so "</br>" can never match the top of the stack.
            if not open_tags or open_tags[-1] != tag_name:
                is_valid = False
                break
            open_tags.pop()
        elif tag_name not in void_tags:
            open_tags.append(tag_name)

    # Anything left on the stack was opened but never closed.
    if not is_valid or open_tags:
        self.add_error(
            error_messages.INVALID_HTML_FOUND,
            pointer=pointer,
            text=text,
        )

def is_valid_html_entity(self, entity):
    """Check whether a single HTML character reference is valid.

    Named references are looked up in the standard library's HTML5 entity table.
    Numeric references must consist solely of ASCII decimal digits ("&#169;") or of
    "x"/"X" followed by ASCII hexadecimal digits ("&#xA9;"), and must not exceed the
    largest Unicode code point (0x10FFFF). Forms that int() would tolerate but HTML
    does not, such as signs, underscores or a nested "0x" prefix, are rejected.

    Args:
        entity (str): The HTML entity to validate, including the leading "&" and
            trailing ";" (e.g. "&amp;", "&#169;").

    Returns:
        bool: True if the entity is a valid named or numeric reference, False otherwise.
    """
    if not (entity.startswith("&") and entity.endswith(";")):
        return False

    if not entity.startswith("&#"):
        # Keys in the html5 table carry the trailing ";" (e.g. "amp;").
        return entity[1:] in html.entities.html5

    reference = entity[2:-1]

    if re.fullmatch(r"[xX][0-9a-fA-F]+", reference):
        digits, base = reference[1:], 16
    elif re.fullmatch(r"[0-9]+", reference):
        digits, base = reference, 10
    else:
        return False

    try:
        code_point = int(digits, base)
    except ValueError:
        # int() refuses extremely long digit strings; such a reference cannot be valid anyway.
        return False

    return code_point <= 0x10FFFF

def validate_html_entities(self, text, pointer):
    """Validate the HTML entities found in the text.

    Every "&...;" reference is extracted and checked with is_valid_html_entity. A single
    INVALID_HTML_ENTITIES_FOUND error is recorded for the text as soon as one invalid
    reference is found.

    Args:
        text (str): The text to validate for HTML entities.
        pointer (str): JSON pointer to the location of the text in the schema.
    """
    # "&" is excluded from the reference body so that a bare ampersand cannot swallow the
    # reference that follows it: "Q&A&nbsp;" yields "&nbsp;", not "&A&nbsp;".
    candidates = re.findall(r"&[^;&\s]+;", text)

    # all() stops at the first invalid reference, so at most one error is reported per text.
    if not all(self.is_valid_html_entity(candidate) for candidate in candidates):
        self.add_error(
            error_messages.INVALID_HTML_ENTITIES_FOUND,
            pointer=pointer,
            text=text,
        )

def validate_whitespace_before_br_tag(self, text, pointer):
    """Check for whitespace directly before a <br> tag.

    The tag is matched case-insensitively and may carry attributes or a self-closing
    slash, so " <br>", " <BR/>", " <br />" and " <br class='x'>" are all reported.
    One SPACE_BEFORE_BR error is recorded per text regardless of how many tags match.

    Args:
        text (str): The text to validate.
        pointer (str): JSON pointer to the location of the text in the schema.
    """
    # The lookahead ensures the tag name is exactly "br" and not a longer name starting with it.
    if re.search(r"\s<br(?=[\s/>])[^>]*>", text, re.IGNORECASE):
        self.add_error(
            error_messages.SPACE_BEFORE_BR,
            pointer=pointer,
            text=text,
        )

def validate_p_tag_position(self, text, pointer):
    """Check that the first <p> tag sits at the start of the text.

    Only the first <p> tag (matched case-insensitively) is inspected. Whitespace and the
    wrapper characters "[", "]", "(" and ")" before it are ignored; any other content
    before it records an INVALID_HTML_FOUND error. Text without a <p> tag is accepted.

    Args:
        text (str): The text to validate.
        pointer (str): JSON pointer to the location of the text in the schema.
    """
    # The lookahead stops tags that merely start with "p" (e.g. <pre>) from matching.
    first_p_tag = re.search(r"<p(?=[\s>])[^>]*>", text, re.IGNORECASE)

    if first_p_tag is None:
        return

    leading_content = text[: first_p_tag.start()].strip(" \t\n\r[]()")

    if leading_content:
        self.add_error(
            error_messages.INVALID_HTML_FOUND,
            pointer=pointer,
            text=text,
        )

def validate_white_spaces(self):
"""Validate that there are no leading, trailing or multiple consecutive white spaces in the translatable text
of the questionnaire schema.
Expand Down
133 changes: 133 additions & 0 deletions tests/schemas/invalid/test_invalid_html_tags.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
{
"mime_type": "application/json/ons/eq",
"language": "en",
"schema_version": "0.0.1",
"data_version": "0.0.3",
"survey_id": "144",
"theme": "default",
"title": "Test invalid html",
"legal_basis": "Notice is given under section 999 of the Test Act 2000",
"metadata": [
{
"name": "user_id",
"type": "string"
},
{
"name": "period_id",
"type": "string"
},
{
"name": "ru_name",
"type": "string"
},
{
"name": "ru_ref",
"type": "string"
},
{
"name": "trad_as",
"type": "string",
"optional": true
}
],
"questionnaire_flow": {
"type": "Linear",
"options": {}
},
"sections": [
{
"id": "introduction-section",
"title": "Introduction <br>",
"groups": [
{
"id": "introduction-group",
"title": "<p>General Business Information",
"blocks": [
{
"id": "introduction",
"type": "Introduction",
"primary_content": [
{
"id": "business-details",
"title": "Introduction with &fake; valid and invalid HTML",
"contents": [
{
"guidance": {
"contents": [
{
"title": "<invalid>Coronavirus (COVID-19) guidance</invalid>",
"description": "<strong>Explain your figures</strong> in the comment section to minimise us &amp; contacting you and to help us tell an industry story"
}
]
}
}
]
}
]
},
{
"type": "Interstitial",
"id": "intersitital-one",
"content": {
"title": "Page <p>with invalid html</p> in title",
"contents": [
{
"description": "[<p>You have successfully completed this section</p>]"
}
]
}
},
{
"type": "Interstitial",
"id": "interstitial-two",
"content": {
"title": "Page with link",
"contents": [
{
"description": "<a href='link'>Anchor</a>"
}
]
}
},
{
"type": "Interstitial",
"id": "interstitial-three",
"content": {
"title": "<strong>Page with mixed invalid tags</strong>",
"contents": [
{
"description": "<h1>Title</h1><em>Not valid tag</em>"
}
]
}
},
{
"type": "Interstitial",
"id": "interstitial-four",
"content": {
"title": "Valid double <strong>strong</strong> with another <strong>strong</strong>.",
"contents": [
{
"description": "<strong>Title</strong><em>Not valid tag</em>"
}
]
}
},
{
"type": "Interstitial",
"id": "interstitial-five",
"content": {
"title": "Valid double anchor.",
"contents": [
{
"description": "<a href='link'>Title</a> and <a href='link-2'>valid tag</a>"
}
]
}
}
]
}
]
}
]
}
46 changes: 46 additions & 0 deletions tests/test_questionnaire_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,52 @@ def test_invalid_whitespaces_in_schema():
assert validator.errors == expected_error_messages


def test_invalid_html_in_schema():
    """validate_html reports every HTML problem in the invalid HTML fixture, in order."""
    schema = _open_and_load_schema_file("schemas/invalid/test_invalid_html_tags.json")
    validator = QuestionnaireValidator(schema)

    # (message, pointer, text) for each error, in the order the validator is expected to record them.
    expected_rows = (
        (
            error_messages.SPACE_BEFORE_BR,
            "/sections/0/title",
            "Introduction <br>",
        ),
        (
            error_messages.INVALID_HTML_FOUND,
            "/sections/0/groups/0/title",
            "<p>General Business Information",
        ),
        (
            error_messages.INVALID_HTML_FOUND,
            "/sections/0/groups/0/blocks/1/content/title",
            "Page <p>with invalid html</p> in title",
        ),
        (
            error_messages.INVALID_HTML_FOUND,
            "/sections/0/groups/0/blocks/3/content/contents/0/description",
            "<h1>Title</h1><em>Not valid tag</em>",
        ),
        (
            error_messages.INVALID_HTML_FOUND,
            "/sections/0/groups/0/blocks/4/content/contents/0/description",
            "<strong>Title</strong><em>Not valid tag</em>",
        ),
        (
            error_messages.INVALID_HTML_ENTITIES_FOUND,
            "/sections/0/groups/0/blocks/0/primary_content/0/title",
            "Introduction with &fake; valid and invalid HTML",
        ),
        (
            error_messages.INVALID_HTML_FOUND,
            "/sections/0/groups/0/blocks/0/primary_content/0/contents/0/guidance/contents/0/title",
            "<invalid>Coronavirus (COVID-19) guidance</invalid>",
        ),
    )
    expected_error_messages = [
        {"message": message, "pointer": pointer, "text": text} for message, pointer, text in expected_rows
    ]

    validator.validate_html()

    assert validator.errors == expected_error_messages


def test_invalid_answer_type_for_question_summary_concatenation():
filename = "schemas/invalid/test_invalid_answer_type_for_question_summary.json"

Expand Down
Loading