Skip to content

Commit d8303a6

Browse files
authored
feat: implement strip mode (#13)
* chore: ignore tests in coverage * feat: implement strip mode Strips all `.py{c,o}`, `RECORD`, `direct_url.json`, and `__pycache__` files to ensure reproducible builds. * feat: add tests for new `strip` mode * chore: update README
1 parent 5a3bc5b commit d8303a6

5 files changed

Lines changed: 80 additions & 16 deletions

File tree

README.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,26 @@ One of the following must be specified:
3838
- `--output-dir`: The output directory for the final zip file. The name of the zip file will be based on the project's
3939
name in the `pyproject.toml` file (with dashes replaced with underscores).
4040

41-
## A Note on Reproducibility
41+
## Notes on Reproducibility
42+
43+
### Timestamps
4244

4345
The generated ZIP files adhere to [reproducible builds](https://reproducible-builds.org/docs/archives/) practices. This means that file permissions and timestamps are modified inside the ZIP, such that the ZIP will have a deterministic hash. By default, the date is set to `1980-01-01`.
4446

4547
Additionally, the tool respects the standardized `$SOURCE_DATE_EPOCH` [environment variable](https://reproducible-builds.org/docs/source-date-epoch/), which will allow you to set that date as needed.
4648

4749
One important caveat is that ZIP files do not support files with timestamps earlier than `1980-01-01` inside them, due to MS-DOS compatibility. Therefore, the tool will raise a `SourceDateEpochError` if `$SOURCE_DATE_EPOCH` is below `315532800`.
50+
51+
### Files with embedded full paths
52+
53+
In testing, we found that several file types can leak information from the machine that generated the virtual environment.
54+
55+
To get around this, the tool removes the following files:
56+
57+
```gitignore
58+
**/__pycache__/
59+
**/*.dist-info/direct_url.json
60+
**/*.dist-info/RECORD
61+
**/*.pyc
62+
**/*.pyo
63+
```

package_python_function/packager.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313

1414
class Packager:
1515
AWS_LAMBDA_MAX_UNZIP_SIZE = 262_144_000
16+
DIRS_TO_EXCLUDE = ["__pycache__"]
17+
DIST_INFO_FILES_TO_EXCLUDE = ["RECORD", "direct_url.json"]
18+
EXTENSIONS_TO_EXCLUDE = [".pyc", ".pyo"]
1619

1720
def __init__(self, venv_path: Path, project_path: Path, output_dir: Path, output_file: Path | None):
1821
self.project = PythonProject(project_path)
@@ -45,10 +48,18 @@ def zip_all_dependencies(self, target_path: Path) -> None:
4548
def zip_dir(path: Path) -> None:
4649
for item in path.iterdir():
4750
if item.is_dir():
48-
zip_dir(item)
51+
if item.name not in self.DIRS_TO_EXCLUDE:
52+
zip_dir(item)
4953
else:
50-
self._uncompressed_bytes += item.stat().st_size
51-
zip_file.write_reproducibly(item, item.relative_to(self.input_path))
54+
is_excluded_by_extension = item.suffix in self.EXTENSIONS_TO_EXCLUDE
55+
is_excluded_dist_info_file = (
56+
item.name in self.DIST_INFO_FILES_TO_EXCLUDE
57+
if item.parent.name.endswith(".dist-info")
58+
else False
59+
)
60+
if not (is_excluded_by_extension or is_excluded_dist_info_file):
61+
self._uncompressed_bytes += item.stat().st_size
62+
zip_file.write_reproducibly(item, item.relative_to(self.input_path))
5263

5364
zip_dir(self.input_path)
5465

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ addopts = """
3636
--capture=tee-sys
3737
"""
3838

39+
[tool.coverage.run]
40+
branch = true
41+
omit = ["*/tests/**"]
42+
3943
[tool.pipx-install]
4044
poetry = "==2.1.1"
4145
poethepoet = "==0.33.1"

tests/conftest.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from dataclasses import dataclass
23
from pathlib import Path
34
from typing import Self
@@ -24,6 +25,7 @@ def new(cls, path: str, contents: str = "") -> Self:
2425

2526
@dataclass
2627
class Data:
28+
files_excluded_from_bundle: list[File] # relative to packages_dir
2729
project_files: list[File] # relative to packages_dir
2830
pyproject: PythonProject
2931
python_version: str
@@ -34,10 +36,12 @@ def new(
3436
cls,
3537
project_name: str,
3638
project_files: list[File],
39+
files_excluded_from_bundle: list[File],
3740
python_version: str = "3.13",
3841
) -> Self:
3942
pyproject = _new_python_project(name=project_name)
4043
return cls(
44+
files_excluded_from_bundle=files_excluded_from_bundle,
4145
project_files=project_files,
4246
pyproject=pyproject,
4347
python_version=python_version,
@@ -86,25 +90,47 @@ def verify_file_reproducibility(file_info: list[ZipInfo], expected_file_date_tim
8690
assert info.date_time == expected_file_date_time
8791

8892
@pytest.fixture
89-
def test_data(tmp_path: Path):
93+
def test_files(tmp_path: Path):
94+
files_excluded_from_bundle = [
95+
File.new("__pycache__/_virtualenv.cpython-313.pyc"),
96+
File.new("project_1.dist-info/RECORD"),
97+
File.new("project_1.dist-info/direct_url.json", contents=json.dumps({"url": str(tmp_path)})),
98+
]
9099
files = [
91100
File.new("project_1/__init__.py"),
92101
File.new("project_1/project1.py"),
102+
File.new("project_1.dist-info/METADATA"),
93103
File.new("small_dependency/__init__.py"),
94104
File.new("small_dependency/small_dependency.py", "# This is a small dependency"),
105+
*files_excluded_from_bundle,
95106
]
96-
data = Data.new(project_name="project-1", project_files=files).commit(loc=tmp_path)
97-
yield data
107+
yield files, files_excluded_from_bundle, tmp_path
98108

99109
@pytest.fixture
100-
def test_data_nested(tmp_path: Path):
101-
files = [
102-
File.new("project_1/__init__.py"),
103-
File.new("project_1/project1.py"),
104-
File.new("small_dependency/__init__.py"),
105-
File.new("small_dependency/small_dependency.py", "# This is a small dependency"),
110+
def test_files_nested(test_files):
111+
files, files_excluded_from_bundle, tmp_path = test_files
112+
big_files = [
106113
File.new("gigantic_dependency/__init__.py"),
107114
File.new("gigantic_dependency/gigantic.py", "a" * Packager.AWS_LAMBDA_MAX_UNZIP_SIZE),
108115
]
109-
data = Data.new(project_name="project-1", project_files=files).commit(loc=tmp_path)
116+
yield [*files, *big_files], files_excluded_from_bundle, tmp_path
117+
118+
@pytest.fixture
119+
def test_data(test_files):
120+
files, files_excluded_from_bundle, loc = test_files
121+
data = Data.new(
122+
project_name="project-1",
123+
project_files=files,
124+
files_excluded_from_bundle=files_excluded_from_bundle,
125+
).commit(loc=loc)
126+
yield data
127+
128+
@pytest.fixture
129+
def test_data_nested(test_files_nested):
130+
files, files_excluded_from_bundle, loc = test_files_nested
131+
data = Data.new(
132+
project_name="project-1-nested",
133+
project_files=files,
134+
files_excluded_from_bundle=files_excluded_from_bundle,
135+
).commit(loc=loc)
110136
yield data

tests/test_package_python_function.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,10 @@ def test_package_python_function(
6767
verify_file_reproducibility(zip.infolist(), expected_file_date_time=expected_date_time)
6868

6969
for file in test_data.project_files:
70-
assert (verify_dir / file.path).exists()
70+
if file in test_data.files_excluded_from_bundle:
71+
assert not (verify_dir / file.path).exists()
72+
else:
73+
assert (verify_dir / file.path).exists()
7174

7275
@pytest.mark.parametrize(
7376
"src_epoch, expected_exception, expected_date_time",
@@ -131,5 +134,9 @@ def test_package_python_function_nested(
131134
with zipfile.ZipFile(inner_zip, "r") as izip:
132135
izip.extractall(verify_dir)
133136
verify_file_reproducibility(izip.infolist(), expected_file_date_time=expected_date_time)
137+
134138
for file in test_data_nested.project_files:
135-
assert (verify_dir / file.path).exists()
139+
if file in test_data_nested.files_excluded_from_bundle:
140+
assert not (verify_dir / file.path).exists()
141+
else:
142+
assert (verify_dir / file.path).exists()

0 commit comments

Comments
 (0)