Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
pip install -r requirements-dev.txt
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand Down
36 changes: 34 additions & 2 deletions data_url/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
(?P<MIME>[a-z][a-z0-9\-]+/[a-z][\w\-\.\+]+)? # optional media type
(?P<parameters>(?:;[\w\-\.+]+=[\w\-\.+%]+)*) # optional attribute=values, value can be url encoded
(?P<encoded>;base64)?, # optional base64 flag
(?P<data>[\w\d.~%\=\/\+-]+) # the data
(?P<data>.*) # data section - validate separately
""",
re.MULTILINE | re.VERBOSE
)
Expand Down Expand Up @@ -126,11 +126,23 @@ def __parse_url(self):
for pair in params.split(";"):
if pair:
name, value = pair.split("=", 1)
# base64 is reserved and can only appear as a flag, not as a parameter
if name == "base64":
return False
self._parameters[name] = unquote(value)

raw_data = match.group('data')

# Validate the data section contains only allowed characters
if not _validate_data_section(raw_data, self._is_base64_encoded):
return False

if self._is_base64_encoded:
self._data = base64.b64decode(raw_data)
try:
self._data = base64.b64decode(raw_data)
except Exception:
# Invalid base64 data
return False
else:
self._data = raw_data
return True
Expand Down Expand Up @@ -181,3 +193,23 @@ def parameters(self):
if not hasattr(self, '_parameters'):
self._parameters = {}
return self._parameters

# Compiled once at import time; used with ``fullmatch`` so no anchors are needed.
# Base64 alphabet plus the ``=`` padding character.
_BASE64_DATA_RE = re.compile(r'[A-Za-z0-9+/=]*')
# Unreserved URL characters plus ``%XX`` percent-encoded sequences.
_URLENCODED_DATA_RE = re.compile(r'(?:[A-Za-z0-9\-_.!~*\'()]|%[0-9A-Fa-f]{2})*')


def _validate_data_section(data, is_base64=False):
    """
    Validate that the data section contains only allowed characters.

    The whole string must consist of allowed characters. ``fullmatch`` is
    used instead of ``^...$`` with ``match`` because ``$`` also matches just
    before a trailing newline, which would let ``"abc\\n"`` slip through.

    Args:
        data (str): The data section to validate
        is_base64 (bool): Whether this is base64 encoded data

    Returns:
        bool: True if valid, False otherwise. An empty string is valid
        in both modes.
    """
    pattern = _BASE64_DATA_RE if is_base64 else _URLENCODED_DATA_RE
    return pattern.fullmatch(data) is not None
3 changes: 3 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pytest
pyyaml
flake8
165 changes: 165 additions & 0 deletions test/data_url_test_cases.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
test_cases:
# Valid test cases (20 URLs that must be accepted by the parser)

- url: "data:text/plain,Hello%20World"
valid: true
description: "Basic text with URL-encoded space"

- url: "data:text/plain,Hello-World_123"
valid: true
description: "Text with unreserved characters"

- url: "data:text/plain,Hello.World"
valid: true
description: "Text with dot character"

- url: "data:text/plain,Hello!World"
valid: true
description: "Text with exclamation mark"

- url: "data:text/plain,Hello~World"
valid: true
description: "Text with tilde character"

- url: "data:text/plain,Hello*World"
valid: true
description: "Text with asterisk character"

- url: "data:text/plain,Hello%27World"
valid: true
description: "Text with URL-encoded single quote"

- url: "data:text/plain,Hello%28World%29"
valid: true
description: "Text with URL-encoded parentheses"

- url: "data:text/plain;base64,SGVsbG8gV29ybGQ="
valid: true
description: "Base64 encoded text"

- url: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
valid: true
description: "Base64 encoded PNG image"

- url: "data:application/json;charset=utf-8,Hello%20World"
valid: true
description: "JSON MIME type with charset parameter"

- url: "data:text/plain,Hello%7BWorld%7D"
valid: true
description: "Text with URL-encoded curly braces"

- url: "data:text/plain,Hello%5BWorld%5D"
valid: true
description: "Text with URL-encoded square brackets"

- url: "data:text/plain,Hello%3AWorld"
valid: true
description: "Text with URL-encoded colon"

- url: "data:text/plain,Hello%40World"
valid: true
description: "Text with URL-encoded at symbol"

- url: "data:text/plain,Hello%2FWorld"
valid: true
description: "Text with URL-encoded forward slash"

- url: "data:text/plain,Hello%23World"
valid: true
description: "Text with URL-encoded hash"

- url: "data:text/plain,Hello%22World%22"
valid: true
description: "Text with URL-encoded double quotes"

- url: "data:text/plain,"
valid: true
description: "Empty data section"

- url: "data:,Hello%20World"
valid: true
description: "Missing MIME type"


# Invalid test cases (20 URLs that must be rejected by the parser)

- url: "data:text/plain,Hello World"
valid: false
description: "Unencoded space character"

- url: "data:text/plain,Hello{World}"
valid: false
description: "Unencoded curly braces"

- url: "data:text/plain,Hello[World]"
valid: false
description: "Unencoded square brackets"

- url: "data:text/plain,Hello:World"
valid: false
description: "Unencoded colon"

- url: "data:text/plain,Hello@World"
valid: false
description: "Unencoded at symbol"

- url: "data:text/plain,Hello/World"
valid: false
description: "Unencoded forward slash"

- url: "data:text/plain,Hello#World"
valid: false
description: "Unencoded hash"

- url: "data:text/plain,Hello\"World\""
valid: false
description: "Unencoded double quotes"

- url: "data:text/plain,Hello<World>"
valid: false
description: "Unencoded angle brackets"

- url: "data:text/plain,Hello=World"
valid: false
description: "Unencoded equals sign"

- url: "data:TEXT/plain,Hello"
valid: false
description: "MIME type with uppercase letters"

- url: "data:text/PLAIN,Hello"
valid: false
description: "MIME subtype with uppercase letters"

- url: "data:text,Hello"
valid: false
description: "MIME type without subtype"

- url: "data:/plain,Hello"
valid: false
description: "MIME type starts with slash"

- url: "data:text/,Hello"
valid: false
description: "Empty MIME subtype"

- url: "data:text/plain;base64;charset=utf-8,Hello"
valid: false
description: "Parameters after base64 flag are not allowed"

- url: "data:text/plain;base64=true,Hello"
valid: false
description: "Base64 flag with value is not allowed"

- url: "data:text/plain;BASE64,Hello"
valid: false
description: "Base64 flag with uppercase letters"

- url: "data:text/plain"
valid: false
description: "Missing comma separator and data"

- url: "DATA:text/plain,Hello"
valid: false
description: "Uppercase DATA scheme is not allowed"
21 changes: 21 additions & 0 deletions test/test_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import unittest
import yaml
import os
from data_url import DataURL

class TestDataURLRegex(unittest.TestCase):
    """Data-driven accept/reject tests for ``DataURL.from_url``.

    Each entry of ``data_url_test_cases.yaml`` supplies a ``url``, the
    expected ``valid`` flag and a human-readable ``description``.
    """

    def setUp(self):
        """Load the test cases from the YAML file located next to this module."""
        test_file = os.path.join(os.path.dirname(__file__), 'data_url_test_cases.yaml')
        # Explicit encoding keeps fixture loading independent of the platform locale.
        with open(test_file, 'r', encoding='utf-8') as f:
            self.test_cases = yaml.safe_load(f)['test_cases']

    def test_data_url_regex(self):
        """Check every fixture URL; a URL counts as valid when ``from_url`` returns non-None."""
        for case in self.test_cases:
            url = case['url']
            expected_valid = case['valid']
            description = case['description']

            # Parse inside the subTest so that an exception raised for one URL
            # is reported against that URL and does not abort the remaining cases.
            with self.subTest(url=url, description=description):
                data_url = DataURL.from_url(url)
                actual_valid = data_url is not None
                self.assertEqual(actual_valid, expected_valid, f"URL: {url}\nDescription: {description}")