All files / src zip-canonical.ts

100% Statements 52/52
100% Branches 27/27
100% Functions 10/10
100% Lines 44/44

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223                                                                                                10x           10x                                                                                       31x 31x   6x                                 124x 116x 116x         8x                         15x 15x 12x 12x   10x 10x 64x 1x         9x 60x 60x 60x 2x           7x                             120x 120x 120x 119x 30x 30x   119x 119x               30x 30x 393x   83x       393x 393x 363x                 124x       8x 8x 283x 283x   8x    
/**
 * ZIP-archive canonicalization for xlsx/ods matchers.
 *
 * xlsx and ods are both ZIP archives containing XML entries (plus an
 * occasional binary image). For oracle-replay we want two archives
 * to compare equal when their semantic content matches, ignoring:
 *
 *   • file order in the zip stream
 *   • XML attribute order, insignificant whitespace, comments
 *     (handled by the linkedom-based canonicalizer from
 *     `html-canonical.ts`)
 *   • volatile metadata elements in `docProps/` (xlsx) and `meta.xml`
 *     (ods) that carry timestamps / authorship / tool version
 *
 * Volatile-element drop lists per file (explicit and documented here):
 *
 * xlsx `docProps/core.xml`
 *   - `<dcterms:created>`
 *   - `<dcterms:modified>`
 *   - `<cp:lastModifiedBy>`
 *   - `<cp:revision>` (rolls forward on every save)
 *
 * xlsx `docProps/app.xml`
 *   - `<AppVersion>` (rolls with the tool version)
 *   - `<TotalTime>` (edit-session clock)
 *
 * ods `meta.xml`
 *   - `<meta:creation-date>`
 *   - `<dc:date>`
 *   - `<meta:editing-duration>`
 *   - `<meta:editing-cycles>`
 *   - `<meta:generator>`
 *   - `<dc:creator>` (ods writes the OS username here)
 *
 * Non-XML binary entries (images under `xl/media/`, `Pictures/`, etc.)
 * are compared byte-for-byte — SocialCalc doesn't embed images so in
 * practice these slots are empty, but the code path is covered.
 *
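 * Corrupted zips do not throw: they surface as an `{ equal: false }`
 * result whose `diff` carries the fflate error message, so the matcher
 * can report a meaningful diff. `HtmlParseError` is thrown only when an
 * XML entry cannot be parsed into a rooted document.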
 */
 
import { unzipSync } from 'fflate';
import { DOMParser } from 'linkedom';
 
import { HtmlParseError, normalizeDomNode } from './html-canonical.ts';
 
/**
 * xlsx volatile element names keyed by path-within-zip.
 *
 * Shaped like the `volatile` argument of `compareZipArchives` and
 * `canonicalizeZipEntry`: each key is an entry path, each value lists the
 * element names dropped from that entry before comparison.
 *
 * Why these are volatile: `cp:revision` rolls forward on every save,
 * `AppVersion` rolls with the tool version, and `TotalTime` is an
 * edit-session clock; the remaining names carry timestamps/authorship.
 */
export const VOLATILE_XLSX_DOCPROPS: Readonly<Record<string, readonly string[]>> = {
  'docProps/core.xml': ['dcterms:created', 'dcterms:modified', 'cp:lastModifiedBy', 'cp:revision'],
  'docProps/app.xml': ['AppVersion', 'TotalTime'],
};
 
/**
 * ods volatile element names keyed by path-within-zip.
 *
 * Shaped like the `volatile` argument of `compareZipArchives` and
 * `canonicalizeZipEntry`: each key is an entry path, each value lists the
 * element names dropped from that entry before comparison.
 */
export const VOLATILE_ODS_META: Readonly<Record<string, readonly string[]>> = {
  'meta.xml': [
    'meta:creation-date',
    'dc:date',
    'meta:editing-duration',
    'meta:editing-cycles',
    'meta:generator',
    'dc:creator', // ods writes the OS username here
  ],
};
 
/**
 * Shape accepted by the per-entry canonicalizer: maps a path-within-zip
 * to the element names that should be dropped from that entry. A path
 * with no key is treated by `canonicalizeZipEntry` as "nothing to drop".
 */
interface VolatileMap {
  readonly [path: string]: readonly string[];
}
 
/**
 * Minimal DOM element type we need — matches linkedom. Declares only
 * the members that the functions in this file read or call.
 */
interface DomElement {
  /** Indexable, length-bearing child list; `walk` traverses it by index. */
  readonly childNodes: { readonly length: number; [i: number]: DomElement };
  /** DOM node-type code; this file treats `1` as "element". */
  readonly nodeType: number;
  /** Name that is compared against the volatile drop list. */
  readonly nodeName: string;
  /** Detaches child `c` from this node; the return value is ignored here. */
  removeChild(c: DomElement): unknown;
  /** Serialized markup; this is what `canonicalizeXmlWithDrops` returns. */
  readonly outerHTML: string;
}
 
/**
 * Result of comparing two archives. `equal` is the go/no-go; `diff`
 * carries a human-readable explanation of the first divergence we
 * found (e.g. missing entry, byte mismatch in sheet1.xml, etc.).
 */
export interface ZipCompareResult {
  /** `true` when every entry matched after canonicalization. */
  readonly equal: boolean;
  /**
   * Explanation of the first divergence. `compareZipArchives` sets it on
   * every `equal: false` result and omits it when `equal` is `true`.
   */
  readonly diff?: string;
}
 
/**
 * Unzip `bytes`, or describe why that failed.
 *
 * Kept separate from `compareZipArchives` so the caller can distinguish
 * "corrupt expected" from "corrupt actual" in the diff text.
 *
 * @param bytes - Raw archive bytes handed to fflate's `unzipSync`.
 * @param label - Human-readable name of the archive; it is prefixed to
 *   the failure message.
 * @returns `{ entries }` (entry bytes keyed by path-within-zip) when
 *   `unzipSync` succeeds, otherwise `{ diff }` describing the failure.
 *   Errors raised by `unzipSync` are converted, not rethrown.
 */
export function unzipOrError(bytes: Uint8Array, label: string): {
  readonly entries?: Readonly<Record<string, Uint8Array>>;
  readonly diff?: string;
} {
  try {
    return { entries: unzipSync(bytes) };
  } catch (err) {
    // Narrow rather than assert: a thrown non-Error value would
    // otherwise show up as "undefined" in the diff text.
    const reason = err instanceof Error ? err.message : String(err);
    return { diff: `${label} is not a valid zip archive: ${reason}` };
  }
}
 
/**
 * Produce the comparison form of one zip entry. Two equivalent entries
 * must yield exactly the same string.
 *
 * Entries whose path looks like XML (see `isXmlPath`) are decoded with a
 * default `TextDecoder` and passed through `canonicalizeXmlWithDrops`
 * together with the drop list registered for that path, if any. Every
 * other entry is treated as binary and rendered as lowercase hex, so a
 * text-based diff still works.
 *
 * @param path - Path of the entry inside the archive.
 * @param bytes - Raw entry contents.
 * @param volatile - Drop lists keyed by entry path.
 * @returns The canonical string for the entry.
 */
export function canonicalizeZipEntry(
  path: string,
  bytes: Uint8Array,
  volatile: VolatileMap,
): string {
  // Binary: hex rather than a hash, because the matcher wants to report
  // "where" the mismatch is and hex keeps per-byte context available.
  if (!isXmlPath(path)) return bytesToHex(bytes);

  const xmlText = new TextDecoder().decode(bytes);
  const dropNames = volatile[path] ?? [];
  return canonicalizeXmlWithDrops(xmlText, dropNames);
}
 
/**
 * Compare two archives entry by entry and report the first divergence.
 *
 * Order of checks:
 *   1. `expected` must unzip, then `actual` must unzip;
 *   2. the sorted entry-path lists must be identical (missing/extra
 *      entries are reported before any content is looked at);
 *   3. each entry, in sorted path order, must have the same canonical
 *      form as produced by `canonicalizeZipEntry`.
 *
 * Exceptions thrown by `canonicalizeZipEntry` are not caught here.
 *
 * @param expected - Bytes of the reference archive.
 * @param actual - Bytes of the archive under test.
 * @param volatile - Drop lists keyed by entry path.
 * @returns `{ equal: true }`, or `{ equal: false, diff }` for the first
 *   problem found.
 */
export function compareZipArchives(
  expected: Uint8Array,
  actual: Uint8Array,
  volatile: VolatileMap,
): ZipCompareResult {
  const mismatch = (diff: string): ZipCompareResult => ({ equal: false, diff });

  const expectedZip = unzipOrError(expected, 'expected body');
  if (expectedZip.diff) return mismatch(expectedZip.diff);
  const actualZip = unzipOrError(actual, 'actual body');
  if (actualZip.diff) return mismatch(actualZip.diff);

  // Past this point both unzips succeeded, so `entries` is populated.
  const expectedEntries = expectedZip.entries!;
  const actualEntries = actualZip.entries!;
  const expectedPaths = Object.keys(expectedEntries).sort();
  const actualPaths = Object.keys(actualEntries).sort();

  const samePathList =
    expectedPaths.length === actualPaths.length &&
    expectedPaths.every((p, i) => p === actualPaths[i]);
  if (!samePathList) {
    return mismatch(
      `zip entry list differs:\n--- expected\n${expectedPaths.join('\n')}\n--- actual\n${actualPaths.join('\n')}`,
    );
  }

  for (const path of expectedPaths) {
    const expectedForm = canonicalizeZipEntry(path, expectedEntries[path]!, volatile);
    const actualForm = canonicalizeZipEntry(path, actualEntries[path]!, volatile);
    if (expectedForm === actualForm) continue;
    return mismatch(
      `zip entry ${path} differs:\n--- expected\n${expectedForm}\n--- actual\n${actualForm}`,
    );
  }
  return { equal: true };
}
 
/**
 * Canonicalize one XML document.
 *
 * The text is parsed with linkedom's `DOMParser` as `text/xml`. Elements
 * whose `nodeName` appears in `dropNames` are removed at any depth, the
 * document element is normalized in place by `normalizeDomNode`, and its
 * `outerHTML` is returned.
 *
 * The removal has to search the whole tree because ODS buries its
 * volatile metadata under `<office:meta>` inside
 * `<office:document-meta>`, not at the root.
 *
 * @param raw - XML source text.
 * @param dropNames - Element names to remove; an empty list skips the
 *   removal pass entirely.
 * @returns Serialized markup of the normalized document element.
 * @throws HtmlParseError when the parsed document has no document
 *   element.
 */
export function canonicalizeXmlWithDrops(raw: string, dropNames: readonly string[]): string {
  const parsed = new DOMParser().parseFromString(raw, 'text/xml') as unknown as {
    documentElement: DomElementWithParent | null;
  };
  const root = parsed.documentElement;
  if (!root) {
    throw new HtmlParseError('linkedom could not parse xml as a rooted document');
  }

  if (dropNames.length !== 0) {
    removeMatchingDescendants(root, new Set(dropNames));
  }

  normalizeDomNode(root as unknown as Parameters<typeof normalizeDomNode>[0]);
  return root.outerHTML;
}
 
/**
 * Remove every element in the tree rooted at `node` whose `nodeName` is
 * in `set`.
 *
 * Works in two passes: matches are collected first and detached
 * afterwards, so the tree is not modified while it is being traversed.
 * The traversal starts at `node` itself, so `node` is tested too (not
 * only its descendants). A match without a `parentNode` is left alone.
 *
 * @param node - Root of the subtree to clean.
 * @param set - Element names to remove.
 */
function removeMatchingDescendants(
  node: DomElementWithParent,
  set: Set<string>,
): void {
  const ELEMENT_NODE = 1;
  const doomed: DomElementWithParent[] = [];

  walk(node, (candidate) => {
    const isElement = candidate.nodeType === ELEMENT_NODE;
    if (isElement && set.has(candidate.nodeName)) doomed.push(candidate);
  });

  doomed.forEach((match) => {
    match.parentNode?.removeChild(match);
  });
}
 
/**
 * Depth-first, pre-order traversal: calls `visit` on `node`, then on
 * each child subtree in index order.
 *
 * `childNodes` is read once per node, after `visit(node)` has returned.
 * NOTE(review): some DOM implementations build a fresh list on every
 * `childNodes` access (linkedom appears to — confirm); re-reading it in
 * the loop condition and body would then cost time quadratic in the
 * number of children.
 *
 * `visit` must not add or remove nodes during the traversal; callers
 * that need to mutate should collect nodes and act afterwards.
 *
 * @param node - Root of the subtree to traverse.
 * @param visit - Callback invoked once per node.
 */
function walk(node: DomElementWithParent, visit: (n: DomElementWithParent) => void): void {
  visit(node);
  const children = node.childNodes;
  for (let i = 0; i < children.length; i++) {
    walk(children[i]! as DomElementWithParent, visit);
  }
}
 
/**
 * `DomElement` plus an optional back-pointer to its parent, which
 * `removeMatchingDescendants` uses to detach a matched node.
 */
interface DomElementWithParent extends DomElement {
  /** Parent node, if any; removal is skipped when it is null or undefined. */
  readonly parentNode?: DomElementWithParent | null;
}
 
/**
 * Decide from the entry path alone whether an entry is treated as XML.
 *
 * An entry counts as XML when its path ends with `.xml` or `.rels`
 * (case-sensitive). `[Content_Types].xml` needs no special case because
 * it already ends with `.xml`.
 *
 * @param path - Path of the entry inside the archive.
 * @returns `true` when the entry should be canonicalized as XML.
 */
function isXmlPath(path: string): boolean {
  const xmlSuffixes = ['.xml', '.rels'];
  return xmlSuffixes.some((suffix) => path.endsWith(suffix));
}
 
/**
 * Render bytes as a lowercase hex string, two digits per byte and no
 * separators (so the output length is always `2 * bytes.length`).
 *
 * @param bytes - Bytes to encode.
 * @returns The hex string; empty for empty input.
 */
function bytesToHex(bytes: Uint8Array): string {
  const pairs = Array.from(bytes, (value) => value.toString(16).padStart(2, '0'));
  return pairs.join('');
}