Skip to content

Commit 0bac8ad

Browse files
committed
feat: scan source distributions for compiled code
The new helper function `scan_compiled_extensions` scans the source tree of a source distribution for compiled code. It flags files with common binary extensions such as `.so` and `.dylib`, as well as files that start with an executable magic header (ELF, Windows `MZ`, Mach-O). The function is designed to detect packaging issues, such as sdists that ship pre-compiled code. It cannot detect supply chain attacks or malicious code.

Signed-off-by: Christian Heimes <cheimes@redhat.com>
1 parent c1c9de6 commit 0bac8ad

2 files changed

Lines changed: 110 additions & 0 deletions

File tree

src/fromager/sources.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,9 @@ def build_sdist(
607607
sdist_root_dir=sdist_root_dir,
608608
build_env=build_env,
609609
)
610+
# look for compiled code in sdist
611+
scan_compiled_extensions(sdist_root_dir)
612+
610613
if req.url:
611614
# The default approach to making an sdist is to make a tarball from the
612615
# source directory, since most of the time we got the source directory
@@ -775,3 +778,80 @@ def validate_sdist_filename(
775778
dist_name=sdist_name,
776779
dist_version=sdist_version,
777780
)
781+
782+
783+
# File name suffixes that mark a file as compiled code, regardless of its
# content. This must be a real set (curly braces) to match the annotation.
_EXTENSION_SUFFIXES: set[str] = {
    ".so",  # Linux, BSD
    ".dylib",  # macOS
    ".pyd",  # Windows
    ".dll",  # Windows
    ".exe",  # Windows
}

# Files with these suffixes are never opened for a magic header check:
# ignore Python, configs, C, C++, CUDA, Go, Rust, text files
_IGNORE_SUFFIXES: set[str] = {
    ".c",
    ".cc",
    ".cu",
    ".go",
    ".h",
    ".ini",
    ".md",
    ".py",
    ".rs",
    ".rst",
    ".sh",
    ".toml",
    ".txt",
    ".yaml",
}

# Magic numbers at the very start of native executables and shared
# libraries. Every entry is at most four bytes long.
_MAGIC_HEADERS: tuple[bytes, ...] = (
    b"\x7fELF",  # Linux, BSD ELF
    b"MZ",  # Windows executable
    # Thin Mach-O files store the magic number in the byte order of the
    # target CPU, so both byte orders have to be listed.
    b"\xfe\xed\xfa\xcf",  # macOS 64-bit, big-endian
    b"\xfe\xed\xfa\xce",  # macOS 32-bit, big-endian
    b"\xcf\xfa\xed\xfe",  # macOS 64-bit, little-endian (x86_64, arm64)
    b"\xce\xfa\xed\xfe",  # macOS 32-bit, little-endian (i386)
    b"\xca\xfe\xba\xbe",  # macOS universal
)
816+
817+
818+
def scan_compiled_extensions(
    root_dir: pathlib.Path,
    *,
    extension_suffixes: set[str] = _EXTENSION_SUFFIXES,
    ignore_suffixes: set[str] = _IGNORE_SUFFIXES,
    warn: bool = True,
) -> list[pathlib.Path]:
    """Scan directory tree for compiled code

    Detect files that have an extension suffix or magic header.

    A file is reported when its suffix is in *extension_suffixes*. Any
    other regular file whose suffix is not in *ignore_suffixes* is opened
    and reported when its first bytes match one of ``_MAGIC_HEADERS``.

    :param root_dir: directory tree to scan
    :param extension_suffixes: suffixes that are always reported
    :param ignore_suffixes: suffixes of files that are never opened
    :param warn: log a warning for every reported file
    :returns: paths (below *root_dir*) of all reported files

    .. warning::

       The function is not designed to detect supply chain attacks or
       malicious code. It's merely a helper to detect packaging issues.
    """
    issues: list[pathlib.Path] = []
    for directory, _, filenames in root_dir.walk():
        for filename in filenames:
            filepath = directory / filename
            suffix = filepath.suffix

            if suffix in extension_suffixes:
                if warn:
                    logger.warning(
                        "file %s has a binary extension suffix",
                        filepath.relative_to(root_dir),
                    )
                issues.append(filepath)
                continue

            if suffix in ignore_suffixes:
                continue

            # Path.walk() does not follow symlinks by default and lists them
            # in ``filenames``, even when they point to a directory or to
            # nothing at all. Opening such an entry would raise and abort
            # the scan, and a FIFO could block, so only read regular files.
            if not filepath.is_file():
                continue

            with filepath.open("rb") as f:
                header = f.read(4)
            if header.startswith(_MAGIC_HEADERS):
                if warn:
                    logger.warning(
                        "file %s starts with an executable file magic header: %r",
                        filepath.relative_to(root_dir),
                        header,
                    )
                issues.append(filepath)
    return issues

tests/test_sources.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pathlib
2+
import sys
23
import typing
34
from unittest.mock import Mock, patch
45

@@ -218,3 +219,32 @@ def test_validate_sdist_file(
218219
else:
219220
with pytest.raises(ValueError):
220221
sources.validate_sdist_filename(req, version, sdist_file)
222+
223+
224+
# Leading bytes of the running Python interpreter; the tests below use them
# as a sample of a real executable file header.
with pathlib.Path(sys.executable).open("rb") as _f:
    _EXEC_HEADER = _f.read(8)


@pytest.mark.parametrize(
    "filename,content,hit",
    [
        ("test.py", b"#!/usr/bin/python", False),
        ("test.so", b"ignore", True),
        ("test", _EXEC_HEADER, True),
        # assume that packages do not disguise compiled code as .py files.
        # A malicious actor can use more elaborate tricks to hide bad code.
        ("test.py", _EXEC_HEADER, False),
    ],
)
def test_scan_compiled_extensions(
    filename: str, content: bytes, hit: bool, tmp_path: pathlib.Path
) -> None:
    """Write one file into an empty tree and check whether the scan reports it."""
    target = tmp_path / filename
    target.write_bytes(content)
    expected = [target] if hit else []
    assert sources.scan_compiled_extensions(tmp_path) == expected

0 commit comments

Comments
 (0)