-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnormalize.py
More file actions
77 lines (60 loc) · 2.77 KB
/
normalize.py
File metadata and controls
77 lines (60 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
PPTX normalization helper for determinism testing.
A raw PPTX is a ZIP archive containing XML files and metadata. Two renders of
the same spec will produce byte-different ZIPs because:
1. ZIP entry timestamps reflect file creation time
2. docProps/core.xml contains dcterms:created and dcterms:modified timestamps
3. docProps/app.xml may contain application version info
4. Shape IDs may be assigned incrementally (stable across runs but worth checking)
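For determinism testing we extract every XML part stored under ppt/ (slides,
slide layouts, slide masters, themes, ...) and hash that. The intent is that
the XML describing what the slide LOOKS LIKE goes into the hash, while
metadata that doesn't affect appearance (docProps/*) is excluded. Entries
under ppt/ that do not end in ".xml" (for example *.rels relationship files
and binary media) are not part of the hash.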
"""
from __future__ import annotations
import hashlib
import io
import re
import zipfile
# Each pattern matches one complete metadata element: opening tag (with any
# attributes), its text content, and the matching closing tag.
_TIMESTAMP_RE = re.compile(r'<dcterms:created[^>]*>[^<]*</dcterms:created>')
_MODIFIED_RE = re.compile(r'<dcterms:modified[^>]*>[^<]*</dcterms:modified>')
_LAST_MODIFIED_BY_RE = re.compile(r'<cp:lastModifiedBy[^>]*>[^<]*</cp:lastModifiedBy>')
_REVISION_RE = re.compile(r'<cp:revision[^>]*>[^<]*</cp:revision>')


def _strip_volatile(xml: str) -> str:
    """Return *xml* with the volatile metadata elements deleted.

    The created / modified timestamps, the last-modified-by author and the
    revision counter are removed, in that order. Each element is dropped
    entirely (tags and text). Self-closing forms such as ``<cp:revision/>``
    are not matched and are left as they are.
    """
    cleaned = xml
    # Applied one after another, in a fixed order, so the result is the same
    # as chaining the four substitutions by hand.
    for volatile_re in (
        _TIMESTAMP_RE,
        _MODIFIED_RE,
        _LAST_MODIFIED_BY_RE,
        _REVISION_RE,
    ):
        cleaned = volatile_re.sub('', cleaned)
    return cleaned
def pptx_content_hash(pptx_bytes: bytes) -> str:
    """Compute a stable SHA-256 hash of the XML parts stored under ``ppt/``.

    Every archive entry whose name starts with ``ppt/`` and ends with
    ``.xml`` is read, passed through ``_strip_volatile``, and fed to the
    hasher in sorted-name order. Only entry names and entry contents are
    hashed, so ZIP-level details such as entry order and entry timestamps do
    not influence the digest. Two renders of the same spec should therefore
    produce the same hash.

    Not hashed: entries outside ``ppt/`` (e.g. ``docProps/*``, which carry
    timestamps that legitimately vary) and entries under ``ppt/`` that do
    not end in ``.xml`` (e.g. ``*.rels`` files and binary media).

    Args:
        pptx_bytes: Raw bytes of a PPTX (ZIP) file.

    Returns:
        The hex-encoded SHA-256 digest.

    Raises:
        zipfile.BadZipFile: If *pptx_bytes* is not a readable ZIP archive.
    """
    hasher = hashlib.sha256()
    with zipfile.ZipFile(io.BytesIO(pptx_bytes)) as zf:
        # Sort entries by name so the archive's internal order is irrelevant.
        entries = sorted(
            name for name in zf.namelist()
            if name.startswith("ppt/") and name.endswith(".xml")
        )
        for name in entries:
            raw = zf.read(name)
            try:
                text = raw.decode("utf-8")
            except UnicodeDecodeError:
                # Not valid UTF-8: hash the bytes untouched. Decoding with
                # errors="replace" would map every bad sequence to U+FFFD,
                # letting different contents collapse to the same digest.
                payload = raw
            else:
                payload = _strip_volatile(text).encode("utf-8")
            # The name is hashed too, so renamed / moved parts are detected;
            # NUL separators keep name and content from running together.
            hasher.update(name.encode("utf-8"))
            hasher.update(b"\0")
            hasher.update(payload)
            hasher.update(b"\0")
    return hasher.hexdigest()
def pptx_content_hash_from_path(path: str) -> str:
    """Read the PPTX file at *path* and return its content hash.

    Thin wrapper: the file is opened in binary mode, read in full, and the
    bytes are handed to ``pptx_content_hash``.
    """
    with open(path, mode="rb") as pptx_file:
        raw_bytes = pptx_file.read()
    return pptx_content_hash(raw_bytes)