Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
import { describe, expect } from 'vitest';
import { testAllure, type AllureBddContext } from '../../testing/allure-test.js';
import { validateFieldStructure } from './pipeline.js';

// Test function used by every case in this file: derived from `testAllure`
// with the 'Document Comparison' epic and the field-validation feature label.
const test = testAllure.epic('Document Comparison').withLabels({
  feature: 'Field Structure Validation (ECMA-376)',
});

/** Namespace declaration for the `w:` prefix, placed on the root element of every fixture. */
const NS = 'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"';

/**
 * Wrap a body fragment in a minimal document envelope.
 *
 * The result is the XML prolog, a `<w:document>` root carrying the `w:`
 * namespace declaration, and a `<w:body>` holding the fragment followed by an
 * empty `<w:sectPr/>`.
 *
 * @param bodyXml - Markup inserted verbatim at the start of `<w:body>`.
 * @returns The complete document XML as a single string.
 */
function buildDoc(bodyXml: string): string {
  const prolog = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`;
  const rootOpen = `<w:document ${NS}>`;
  const body = `<w:body>${bodyXml}<w:sectPr/></w:body>`;
  const rootClose = `</w:document>`;
  return [prolog, rootOpen, body, rootClose].join('');
}

// A well-formed complex field, one run per part:
// begin marker, NUMPAGES instruction text, separator, result text, end marker.
const COMPLETE_FIELD = [
  `<w:r><w:fldChar w:fldCharType="begin"/></w:r>`,
  `<w:r><w:instrText xml:space="preserve"> NUMPAGES </w:instrText></w:r>`,
  `<w:r><w:fldChar w:fldCharType="separate"/></w:r>`,
  `<w:r><w:t>3</w:t></w:r>`,
  `<w:r><w:fldChar w:fldCharType="end"/></w:r>`,
].join('');

// Fixture for the ECMA-376 conformant way to record a tracked change to a
// field's instruction text. The begin/separate/end fldChar runs stay as plain
// sibling runs with no revision wrapper (they cannot enter <w:del>); only the
// instruction text is split, with the new code in <w:ins> and the old code in
// <w:del> as w:delInstrText. Research summary: c-rex ECMA-376 Part 4 fldChar
// topic + DeletedFieldCode placement constraint.
const MODIFIED_FIELD_FRAGMENTED = [
  `<w:r><w:fldChar w:fldCharType="begin"/></w:r>`,
  `<w:ins><w:r><w:instrText xml:space="preserve"> NUMPAGES </w:instrText></w:r></w:ins>`,
  `<w:del><w:r><w:delInstrText xml:space="preserve"> PAGE </w:delInstrText></w:r></w:del>`,
  `<w:r><w:fldChar w:fldCharType="separate"/></w:r>`,
  `<w:r><w:t>3</w:t></w:r>`,
  `<w:r><w:fldChar w:fldCharType="end"/></w:r>`,
].join('');

describe('validateFieldStructure', () => {
  // Single field-character runs shared by the hand-assembled fixtures below.
  const BEGIN = `<w:r><w:fldChar w:fldCharType="begin"/></w:r>`;
  const SEPARATE = `<w:r><w:fldChar w:fldCharType="separate"/></w:r>`;
  const END = `<w:r><w:fldChar w:fldCharType="end"/></w:r>`;

  /** One given/when/then case: a body fragment and the verdict expected for it. */
  interface Scenario {
    /** Test title passed to `test`. */
    title: string;
    /** Text of the `given` step describing the fixture. */
    precondition: string;
    /** Markup handed to `buildDoc` to produce the document under test. */
    body: string;
    /** Value `validateFieldStructure` is expected to return. */
    valid: boolean;
  }

  const scenarios: readonly Scenario[] = [
    {
      title: 'field-free document is valid',
      precondition: 'a document with only literal text runs',
      body: `<w:p><w:r><w:t>hello</w:t></w:r></w:p>`,
      valid: true,
    },
    {
      title: 'complete NUMPAGES field is valid',
      precondition: 'a paragraph containing a well-formed NUMPAGES complex field',
      body: `<w:p>${COMPLETE_FIELD}</w:p>`,
      valid: true,
    },
    {
      title: 'orphan w:instrText outside any field is rejected',
      precondition: 'a paragraph with a bare w:instrText and no surrounding w:fldChar',
      body: `<w:p><w:r><w:instrText> PAGE </w:instrText></w:r></w:p>`,
      valid: false,
    },
    {
      title: 'w:instrText after the separator (in the result section) is rejected',
      precondition: 'a field whose w:instrText is placed AFTER w:fldChar separate',
      body: [
        `<w:p>`,
        BEGIN,
        SEPARATE,
        `<w:r><w:instrText> NUMPAGES </w:instrText></w:r>`,
        END,
        `</w:p>`,
      ].join(''),
      valid: false,
    },
    {
      title: 'unbalanced begin/end counts are rejected',
      precondition: 'a field with two begins and only one end',
      body: [`<w:p>`, BEGIN, BEGIN, END, `</w:p>`].join(''),
      valid: false,
    },
    {
      title:
        'ECMA-376 fragmented field modification (unwrapped fldChars + ins/del instrText) is valid',
      precondition:
        'a field where w:fldChar markers remain unwrapped while w:instrText/w:delInstrText fragment into <w:ins>/<w:del>',
      body: `<w:p>${MODIFIED_FIELD_FRAGMENTED}</w:p>`,
      valid: true,
    },
    {
      title: 'w:delInstrText outside <w:del> is rejected (ECMA-376 DeletedFieldCode)',
      precondition: 'a field where w:delInstrText appears in a run NOT wrapped by <w:del>',
      body: [
        `<w:p>`,
        BEGIN,
        `<w:r><w:delInstrText> NUMPAGES </w:delInstrText></w:r>`,
        SEPARATE,
        END,
        `</w:p>`,
      ].join(''),
      valid: false,
    },
    {
      title: 'w:delInstrText inside <w:del> but outside any field body is rejected',
      precondition: 'a <w:del> wrapping w:delInstrText with no enclosing field begin/separate',
      body: [
        `<w:p>`,
        `<w:del><w:r><w:delInstrText> NUMPAGES </w:delInstrText></w:r></w:del>`,
        `</w:p>`,
      ].join(''),
      valid: false,
    },
    {
      title: 'w:fldChar nested inside <w:del> is rejected (ECMA-376 fatal violation)',
      precondition: 'a <w:del> wrapping a balanced begin/end field-character pair',
      body: [`<w:p>`, `<w:del>`, BEGIN, END, `</w:del>`, `</w:p>`].join(''),
      valid: false,
    },
    {
      title: 'w:fldChar inside <w:ins> is allowed (insertion of a new field is conformant)',
      precondition: 'an insertion wrapping a complete NUMPAGES field',
      body: `<w:p><w:ins>${COMPLETE_FIELD}</w:ins></w:p>`,
      valid: true,
    },
  ];

  for (const { title, precondition, body, valid } of scenarios) {
    test(title, async ({ given, when, then }: AllureBddContext) => {
      let xml = '';
      // Start from the opposite of the expected verdict so the assertion
      // cannot pass unless the `when` step actually stores the validator result.
      let ok = !valid;

      await given(precondition, () => {
        xml = buildDoc(body);
      });
      await when('the document is validated', () => {
        ok = validateFieldStructure(xml);
      });
      await then(valid ? 'it passes' : 'it is rejected', () => {
        expect(ok).toBe(valid);
      });
    });
  }
});
56 changes: 42 additions & 14 deletions packages/docx-core/src/baselines/atomizer/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -345,18 +345,27 @@ function buildFailureSummary(
/**
* Validate field structure integrity in document XML.
*
* Checks that fldChar begin/end are balanced and that w:instrText only
* appears inside a proper field sequence (between begin and separate).
* Orphaned instrText elements render as visible text in Word.
* Enforces three ECMA-376 Part 4 constraints on complex fields:
* 1. Global `w:fldChar` begin/end count balance.
* 2. Every `w:instrText` AND `w:delInstrText` sits inside an open field body
* (between `begin` and `separate`). Orphaned instruction text renders as
* literal text in Word.
* 3. `w:delInstrText` is nested inside a `<w:del>` ancestor (DeletedFieldCode
* schema constraint), and conversely `w:fldChar` is NEVER inside `<w:del>`
* (Word treats this as fatal and discards the field state machine).
*
* Called on both pre-accept/reject combined XML (with track-change wrappers)
* and on post-accept/reject XML (wrappers removed). Both cases must satisfy the
* field placement check; constraint (3) is vacuous post-accept/reject.
*/
export function validateFieldStructure(documentXml: string): boolean {
const root = parseDocumentXml(documentXml);

// Walk the document in order, tracking field nesting
const allFldChars = findAllByTagName(root, 'w:fldChar');
const allInstrTexts = findAllByTagName(root, 'w:instrText');
const allDelInstrTexts = findAllByTagName(root, 'w:delInstrText');

// Quick balance check
// Constraint (1): global fldChar begin/end balance.
let begins = 0;
let ends = 0;
for (const fc of allFldChars) {
Expand All @@ -366,19 +375,36 @@ export function validateFieldStructure(documentXml: string): boolean {
}
if (begins !== ends) return false;

// Check that instrText elements are inside a field (between begin and separate).
// Walk all elements in document order using a recursive scan.
if (allInstrTexts.length === 0) return true; // No instrText, nothing to validate
if (
allFldChars.length === 0 &&
allInstrTexts.length === 0 &&
allDelInstrTexts.length === 0
) {
return true;
}

// Depth-first scan to check instrText placement
// Depth-first scan tracking field nesting (for constraint 2) and <w:del>
// ancestor nesting (for constraint 3).
let depth = 0;
const pastSeparatorAtDepth: number[] = []; // track separator state per depth
const pastSeparatorAtDepth: number[] = [];
let insideDelDepth = 0;

function scan(node: Element): boolean {
for (let child = node.firstChild; child; child = child.nextSibling) {
if (child.nodeType !== 1) continue; // skip non-elements
if (child.nodeType !== 1) continue;
const el = child as Element;
const tag = el.tagName;

if (tag === 'w:del') {
insideDelDepth++;
const ok = scan(el);
insideDelDepth--;
if (!ok) return false;
continue;
}

if (el.tagName === 'w:fldChar') {
if (tag === 'w:fldChar') {
if (insideDelDepth > 0) return false;
const type = el.getAttribute('w:fldCharType');
if (type === 'begin') {
depth++;
Expand All @@ -388,8 +414,10 @@ export function validateFieldStructure(documentXml: string): boolean {
} else if (type === 'end') {
if (depth > 0) depth--;
}
} else if (el.tagName === 'w:instrText') {
// instrText must be inside a field (depth > 0) and before the separator
} else if (tag === 'w:instrText') {
if (depth === 0 || pastSeparatorAtDepth[depth]) return false;
} else if (tag === 'w:delInstrText') {
if (insideDelDepth === 0) return false;
if (depth === 0 || pastSeparatorAtDepth[depth]) return false;
}

Expand Down
Loading