diff --git a/packages/docx-core/src/baselines/atomizer/pipeline.field-validation.test.ts b/packages/docx-core/src/baselines/atomizer/pipeline.field-validation.test.ts new file mode 100644 index 0000000..c8f4760 --- /dev/null +++ b/packages/docx-core/src/baselines/atomizer/pipeline.field-validation.test.ts @@ -0,0 +1,253 @@ +import { describe, expect } from 'vitest'; +import { testAllure, type AllureBddContext } from '../../testing/allure-test.js'; +import { validateFieldStructure } from './pipeline.js'; + +const test = testAllure + .epic('Document Comparison') + .withLabels({ feature: 'Field Structure Validation (ECMA-376)' }); + +const NS = 'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"'; + +function buildDoc(bodyXml: string): string { + return ( + `` + + `` + + `${bodyXml}` + ); +} + +const COMPLETE_FIELD = + `` + + ` NUMPAGES ` + + `` + + `3` + + ``; + +// ECMA-376 conformant field-modification pattern: a field whose instruction +// text is changing under track changes. The fldChars remain UNWRAPPED at the +// sibling-run level (they cannot enter ), while the changed instrText +// fragments into / wrappers. Research summary: c-rex ECMA-376 +// Part 4 fldChar topic + DeletedFieldCode placement constraint. +const MODIFIED_FIELD_FRAGMENTED = + `` + + ` NUMPAGES ` + + ` PAGE ` + + `` + + `3` + + ``; + +describe('validateFieldStructure', () => { + test( + 'field-free document is valid', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = false; + + await given('a document with only literal text runs', () => { + xml = buildDoc(`hello`); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it passes', () => { + expect(ok).toBe(true); + }); + }, + ); + + test( + 'complete NUMPAGES field is valid', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = false; + + await given('a paragraph containing a well-formed NUMPAGES complex field', () => { + xml = buildDoc(`${COMPLETE_FIELD}`); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it passes', () => { + expect(ok).toBe(true); + }); + }, + ); + + test( + 'orphan w:instrText outside any field is rejected', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = true; + + await given('a paragraph with a bare w:instrText and no surrounding w:fldChar', () => { + xml = buildDoc(` PAGE `); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it is rejected', () => { + expect(ok).toBe(false); + }); + }, + ); + + test( + 'w:instrText after the separator (in the result section) is rejected', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = true; + + await given('a field whose w:instrText is placed AFTER w:fldChar separate', () => { + xml = buildDoc( + `` + + `` + + `` + + ` NUMPAGES ` + + `` + + ``, + ); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it is rejected', () => { + expect(ok).toBe(false); + }); + }, + ); + + test( + 'unbalanced begin/end counts are rejected', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = true; + + await given('a field with two begins and only one end', () => { + xml = buildDoc( + `` + + `` + + `` + + `` + + ``, + ); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it is rejected', () => { + expect(ok).toBe(false); + }); + }, + ); + + test( + 'ECMA-376 fragmented field modification (unwrapped fldChars + ins/del instrText) is valid', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = false; + + await given( + 'a field where w:fldChar markers remain unwrapped while w:instrText/w:delInstrText fragment into /', + () => { + xml = buildDoc(`${MODIFIED_FIELD_FRAGMENTED}`); + }, + ); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it passes', () => { + expect(ok).toBe(true); + }); + }, + ); + + test( + 'w:delInstrText outside is rejected (ECMA-376 DeletedFieldCode)', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = true; + + await given('a field where w:delInstrText appears in a run NOT wrapped by ', () => { + xml = buildDoc( + `` + + `` + + ` NUMPAGES ` + + `` + + `` + + ``, + ); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it is rejected', () => { + expect(ok).toBe(false); + }); + }, + ); + + test( + 'w:delInstrText inside but outside any field body is rejected', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = true; + + await given('a wrapping w:delInstrText with no enclosing field begin/separate', () => { + xml = buildDoc( + `` + + ` NUMPAGES ` + + ``, + ); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it is rejected', () => { + expect(ok).toBe(false); + }); + }, + ); + + test( + 'w:fldChar nested inside is rejected (ECMA-376 fatal violation)', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = true; + + await given('a wrapping a balanced begin/end field-character pair', () => { + xml = buildDoc( + `` + + `` + + `` + + `` + + `` + + ``, + ); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it is rejected', () => { + expect(ok).toBe(false); + }); + }, + ); + + test( + 'w:fldChar inside is allowed (insertion of a new field is conformant)', + async ({ given, when, then }: AllureBddContext) => { + let xml = ''; + let ok = false; + + await given('an insertion wrapping a complete NUMPAGES field', () => { + xml = buildDoc(`${COMPLETE_FIELD}`); + }); + await when('the document is validated', () => { + ok = validateFieldStructure(xml); + }); + await then('it passes', () => { + expect(ok).toBe(true); + }); + }, + ); +}); diff --git a/packages/docx-core/src/baselines/atomizer/pipeline.ts b/packages/docx-core/src/baselines/atomizer/pipeline.ts index 6e4c903..aab4e4f 100644 --- a/packages/docx-core/src/baselines/atomizer/pipeline.ts +++ b/packages/docx-core/src/baselines/atomizer/pipeline.ts @@ -345,18 +345,27 @@ function buildFailureSummary( /** * Validate field structure integrity in document XML. * - * Checks that fldChar begin/end are balanced and that w:instrText only - * appears inside a proper field sequence (between begin and separate). - * Orphaned instrText elements render as visible text in Word. + * Enforces three ECMA-376 Part 4 constraints on complex fields: + * 1. Global `w:fldChar` begin/end count balance. + * 2. Every `w:instrText` AND `w:delInstrText` sits inside an open field body + * (between `begin` and `separate`). Orphaned instruction text renders as + * literal text in Word. + * 3. `w:delInstrText` is nested inside a `` ancestor (DeletedFieldCode + * schema constraint), and conversely `w:fldChar` is NEVER inside `` + * (Word treats this as fatal and discards the field state machine). + * + * Called on both pre-accept/reject combined XML (with track-change wrappers) + * and on post-accept/reject XML (wrappers removed). Both cases must satisfy the + * field placement check; constraint (3) is vacuous post-accept/reject. */ export function validateFieldStructure(documentXml: string): boolean { const root = parseDocumentXml(documentXml); - // Walk the document in order, tracking field nesting const allFldChars = findAllByTagName(root, 'w:fldChar'); const allInstrTexts = findAllByTagName(root, 'w:instrText'); + const allDelInstrTexts = findAllByTagName(root, 'w:delInstrText'); - // Quick balance check + // Constraint (1): global fldChar begin/end balance. let begins = 0; let ends = 0; for (const fc of allFldChars) { @@ -366,19 +375,36 @@ export function validateFieldStructure(documentXml: string): boolean { } if (begins !== ends) return false; - // Check that instrText elements are inside a field (between begin and separate). - // Walk all elements in document order using a recursive scan. - if (allInstrTexts.length === 0) return true; // No instrText, nothing to validate + if ( + allFldChars.length === 0 && + allInstrTexts.length === 0 && + allDelInstrTexts.length === 0 + ) { + return true; + } - // Depth-first scan to check instrText placement + // Depth-first scan tracking field nesting (for constraint 2) and + // ancestor nesting (for constraint 3). let depth = 0; - const pastSeparatorAtDepth: number[] = []; // track separator state per depth + const pastSeparatorAtDepth: number[] = []; + let insideDelDepth = 0; + function scan(node: Element): boolean { for (let child = node.firstChild; child; child = child.nextSibling) { - if (child.nodeType !== 1) continue; // skip non-elements + if (child.nodeType !== 1) continue; const el = child as Element; + const tag = el.tagName; + + if (tag === 'w:del') { + insideDelDepth++; + const ok = scan(el); + insideDelDepth--; + if (!ok) return false; + continue; + } - if (el.tagName === 'w:fldChar') { + if (tag === 'w:fldChar') { + if (insideDelDepth > 0) return false; const type = el.getAttribute('w:fldCharType'); if (type === 'begin') { depth++; @@ -388,8 +414,10 @@ export function validateFieldStructure(documentXml: string): boolean { } else if (type === 'end') { if (depth > 0) depth--; } - } else if (el.tagName === 'w:instrText') { - // instrText must be inside a field (depth > 0) and before the separator + } else if (tag === 'w:instrText') { + if (depth === 0 || pastSeparatorAtDepth[depth]) return false; + } else if (tag === 'w:delInstrText') { + if (insideDelDepth === 0) return false; if (depth === 0 || pastSeparatorAtDepth[depth]) return false; }