From 2fb827d829bdd1404b1ce4c2805b68ba022ecec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Guimar=C3=A3es?= Date: Tue, 31 Mar 2026 16:38:45 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20preserve=20emoji=20ZWJ=20sequences=20thr?= =?UTF-8?q?ough=20YAML=E2=86=94HCL=20roundtrip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hclwrite escapes U+200D (ZWJ, category Cf) via unicode.IsPrint, and yaml.v3's is_printable misses 4-byte UTF-8, causing supplementary-plane emoji to be re-escaped as \UXXXXXXXX on the return trip. Post-process HCL and YAML output at each generation boundary with unescapeHCLUnicode / unescapeYAMLUnicode, restoring raw UTF-8 for codepoints ≥ U+00A0. Adds TestEmojiRoundtripStability to provider/github. --- .../unicode-emoji-zwj-escape-roundtrip.md | 237 ++++++++++++++++++ provider/github/github.go | 2 +- provider/github/roundtrip_test.go | 63 +++++ provider/github/unparse_action.go | 2 +- provider/github/unparse_workflow.go | 27 +- provider/github/workflow_yaml.go | 26 +- provider/gitlab/pipeline_yaml.go | 28 ++- provider/gitlab/unparse_pipeline.go | 27 +- 8 files changed, 406 insertions(+), 6 deletions(-) create mode 100644 docs/solutions/runtime-errors/unicode-emoji-zwj-escape-roundtrip.md diff --git a/docs/solutions/runtime-errors/unicode-emoji-zwj-escape-roundtrip.md b/docs/solutions/runtime-errors/unicode-emoji-zwj-escape-roundtrip.md new file mode 100644 index 0000000..47a721b --- /dev/null +++ b/docs/solutions/runtime-errors/unicode-emoji-zwj-escape-roundtrip.md @@ -0,0 +1,237 @@ +--- +title: Unicode Emoji ZWJ Sequence Escaping in YAML↔HCL Roundtrip +problem_type: logic-errors +component: provider/github, provider/gitlab +symptoms: + - "YAML `name: 👮‍♂️ Lint` → HCL `name = \"👮\\u200d♂️ Lint\"` (ZWJ escaped)" + - "HCL → YAML `name: \"\\U0001F46E‍♂️ Lint\"` (emoji escaped as \\U)" + - "Multi-codepoint emoji sequences corrupted after YAML→HCL→YAML roundtrip" +tags: + - unicode + - emoji + - yaml + - 
hcl + - roundtrip + - golang +affected_files: + - provider/github/unparse_workflow.go + - provider/github/unparse_action.go + - provider/github/github.go + - provider/github/workflow_yaml.go + - provider/gitlab/unparse_pipeline.go + - provider/gitlab/pipeline_yaml.go + - provider/github/roundtrip_test.go +date: 2026-03-31 +--- + +## Problem Statement + +Emoji sequences that use the Unicode Zero Width Joiner (ZWJ, U+200D) were corrupted +during a YAML→HCL→YAML roundtrip. A workflow named `👮‍♂️ Lint` would, after one roundtrip, become +`\U0001F46E‍♂️ Lint` in YAML output. + +**Example:** +```yaml +# Input YAML +name: 👮‍♂️ Lint +``` +```hcl +# After YAML→HCL (unparse) +name = "👮\u200d♂️ Lint" # ZWJ escaped by hclwrite +``` +```yaml +# After HCL→YAML (parse) — CORRUPTED +name: "\U0001F46E‍♂️ Lint" # Supplementary-plane emoji escaped by yaml.v3 +``` + +--- + +## Root Cause Analysis + +Two separate library bugs compounded in the roundtrip pipeline: + +### Bug 1: `hclwrite` escapes ZWJ (U+200D) + +**Location:** `github.com/hashicorp/hcl/v2@v2.24.0/hclwrite/generate.go:350` + +`escapeQuotedStringLit` uses Go's `unicode.IsPrint(r)` to decide whether to escape a rune. +Per Go's `unicode` package, category Cf (Format) characters are not printable. U+200D (ZERO WIDTH +JOINER) is category Cf, so it is escaped as `\u200d` in HCL string literals. + +When HCL is parsed back, `\u200d` restores correctly. But the next YAML serialisation step then +encounters the ZWJ next to a supplementary-plane emoji... + +### Bug 2: `yaml.v3` escapes supplementary-plane emoji + +**Location:** `gopkg.in/yaml.v3@v3.0.1/yamlprivateh.go:85` (`is_printable`) + +This function was ported from C's libyaml. Its UTF-8 byte range check only handles characters up +to 3-byte sequences (≤ U+FFFF): + +```go +// Only covers bytes 0xC2–0xED (≤ 3-byte UTF-8 sequences) +b[i] > 0xC2 && b[i] < 0xED +``` + +Supplementary-plane emoji like 👮 (U+1F46E) require 4-byte UTF-8 sequences starting at 0xF0–0xF4. 
+Since the check misses them, they are treated as non-printable, triggering double-quoted mode and +`\U0001F46E` escaping in `yaml_emitter_write_double_quoted_scalar`. + +### Combined Effect + +``` +👮‍♂️ (U+1F46E U+200D U+2642 U+FE0F) + │ + ▼ yaml.v3 unmarshal → string (correct) + │ + ▼ hclwrite.SetAttributeValue (unparse step) + → escapeQuotedStringLit → `👮\u200d♂️` ← ZWJ escaped + │ + ▼ hclwrite.Format → `name = "👮\u200d♂️ Lint"` + │ + ▼ hcl parse back → `👮‍♂️` (escape decoded, raw ZWJ present again) + │ + ▼ yaml.v3 Marshal → encounters U+1F46E (4-byte), treats as non-printable + → double-quoted mode → `"\U0001F46E‍♂️ Lint"` ← CORRUPT (only the 4-byte emoji is escaped) +``` + +--- + +## Investigation Steps + +1. **Reproduced** with minimal YAML: `name: 👮‍♂️ Lint` → unparse → parse +2. **Traced HCL layer**: Found `hclwrite/generate.go:350` uses `unicode.IsPrint`; category Cf = + non-printable → ZWJ escaped +3. **Traced YAML layer**: Found `yaml.v3/yamlprivateh.go:85` `is_printable` only handles 3-byte + UTF-8; 4-byte sequences fall through to `\U` escaping +4. **Confirmed** both bugs are in external (vendored) libraries — not modifiable +5. **Chose post-processing** at output boundaries rather than patching external libraries + +--- + +## Solution + +Post-process all HCL and YAML output at the respective generation boundary, replacing `\uXXXX` / +`\UXXXXXXXX` escape sequences with their raw UTF-8 equivalents for codepoints ≥ U+00A0 (YAML 1.2 +printable range). ASCII (U+0000–U+007F) and C1 controls (U+0080–U+009F) remain escaped. 
+ +### HCL Output Post-Processor + +Added to `provider/github/unparse_workflow.go` and `provider/gitlab/unparse_pipeline.go` (each package's HCL generators call its own copy of `unescapeHCLUnicode`): + +```go +func unescapeHCLUnicode(src []byte) []byte { + return reHCLUnicodeEscape.ReplaceAllFunc(src, func(match []byte) []byte { + n, err := strconv.ParseInt(string(match[2:]), 16, 32) + if err != nil || n <= 0x9F || !utf8.ValidRune(rune(n)) { + return match + } + var buf [utf8.UTFMax]byte + l := utf8.EncodeRune(buf[:], rune(n)) + return append([]byte(nil), buf[:l]...) + }) +} + +var reHCLUnicodeEscape = regexp.MustCompile(`\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}`) +``` + +Applied in every HCL generation return: +```go +return unescapeHCLUnicode(hclwrite.Format(f.Bytes())), nil +``` + +### YAML Output Post-Processor + +Added to `provider/github/workflow_yaml.go` and `provider/gitlab/pipeline_yaml.go`: + +```go +func unescapeYAMLUnicode(src []byte) []byte { + return reYAMLUnicodeEscape.ReplaceAllFunc(src, func(match []byte) []byte { + n, err := strconv.ParseInt(string(match[2:]), 16, 32) + if err != nil || n <= 0x9F || !utf8.ValidRune(rune(n)) { + return match + } + var buf [utf8.UTFMax]byte + l := utf8.EncodeRune(buf[:], rune(n)) + return append([]byte(nil), buf[:l]...) 
+ }) +} + +var reYAMLUnicodeEscape = regexp.MustCompile(`\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4}`) +``` + +Applied in `marshalWorkflowYAML` (GitHub) and `marshalPipelineYAML` (GitLab): +```go +// before return +return unescapeYAMLUnicode(out), nil +``` + +### Files Modified + +| File | Change | +|------|--------| +| `provider/github/unparse_workflow.go` | Added `unescapeHCLUnicode` + applied to all HCL returns | +| `provider/github/unparse_action.go` | Applied `unescapeHCLUnicode` | +| `provider/github/github.go` | Applied `unescapeHCLUnicode` | +| `provider/github/workflow_yaml.go` | Added `unescapeYAMLUnicode` + applied to YAML output | +| `provider/gitlab/unparse_pipeline.go` | Added `unescapeHCLUnicode` + applied to all HCL returns | +| `provider/gitlab/pipeline_yaml.go` | Added `unescapeYAMLUnicode` + applied to YAML output | +| `provider/github/roundtrip_test.go` | Added `TestEmojiRoundtripStability` | + +--- + +## Prevention Strategies + +### 1. Test Coverage (Primary Prevention) + +The `TestEmojiRoundtripStability` test in `provider/github/roundtrip_test.go` provides direct +regression coverage: + +```go +func TestEmojiRoundtripStability(t *testing.T) { + emojiName := "\U0001F46E\u200D\u2642\uFE0F Lint" // 👮‍♂️ Lint + // Verifies: no \u200d in HCL, no \U in YAML, emoji preserved verbatim +} +``` + +### 2. When to Apply the Same Pattern + +Apply the same `unescapeXXXUnicode` post-processing pattern at **any output boundary** where: +- An external serialisation library is in use +- That library uses `unicode.IsPrint` or equivalent heuristics +- User-controlled strings may contain non-BMP / category-Cf characters + +Common triggers: emoji, directional marks (LRM U+200E / RLM U+200F), word joiners (U+2060), BOM (U+FEFF). + +### 3. Threshold: U+009F + +The post-processors only unescape codepoints `> 0x9F`. 
This preserves: +- ASCII (U+0000–U+007F): standard escaping rules apply +- C1 controls (U+0080–U+009F): genuinely unsafe in YAML/HCL + +Everything ≥ U+00A0 is valid UTF-8 in both YAML 1.2 and HCL, so it can safely be unescaped. + +### 4. New Provider Checklist + +When implementing a new provider or serialisation path, apply both unescapers: +- HCL generation: wrap `hclwrite.Format(f.Bytes())` with `unescapeHCLUnicode(...)` +- YAML generation: wrap final `[]byte` output with `unescapeYAMLUnicode(...)` + +--- + +## Known Limitations + +- This fix does not cover other binary/non-UTF-8 content (which should remain escaped) +- If `hclwrite` or `yaml.v3` upstream fixes their libraries, these post-processors become no-ops + (harmless, since they only act on escape sequences that wouldn't be present) +- The YAML post-processor targets the double-quoted `\uXXXX` / `\UXXXXXXXX` forms; single-quoted YAML strings + cannot contain escape sequences (yaml.v3 won't produce them for format chars). Because the regex runs over the whole output, literal `\uXXXX` text in a plain, single-quoted or block scalar — or after an escaped backslash (`\\uXXXX`) — is rewritten too + +--- + +## External Library Notes + +| Library | Version | Issue | Status | +|---------|---------|-------|--------| +| `github.com/hashicorp/hcl/v2` | v2.24.0 | `hclwrite` uses `unicode.IsPrint` → escapes Cf chars | Upstream, not fixed | +| `gopkg.in/yaml.v3` | v3.0.1 | `is_printable` misses 4-byte UTF-8 (supplementary-plane) | Upstream bug, not fixed | diff --git a/provider/github/github.go b/provider/github/github.go index aa13002..4809564 100644 --- a/provider/github/github.go +++ b/provider/github/github.go @@ -231,5 +231,5 @@ func unparseYAMLFile(yamlBytes []byte, baseName string) ([]byte, error) { } } - return hclwrite.Format(f.Bytes()), nil + return unescapeHCLUnicode(hclwrite.Format(f.Bytes())), nil } diff --git a/provider/github/roundtrip_test.go b/provider/github/roundtrip_test.go index 8a4616c..96d794f 100644 --- a/provider/github/roundtrip_test.go +++ b/provider/github/roundtrip_test.go @@ -229,3 +229,66 @@ workflow "wf" { assertYAMLValueEqual(t, first, second) } + +// 
TestEmojiRoundtripStability verifies that workflow names containing emoji +// ZWJ sequences (e.g. 👮‍♂️) round-trip through YAML → HCL → YAML without +// acquiring Unicode escape sequences in either the HCL or the output YAML. +func TestEmojiRoundtripStability(t *testing.T) { + tmpDir := t.TempDir() + + inputYAML := filepath.Join(tmpDir, "lint.yaml") + unparseDir := filepath.Join(tmpDir, "unparse") + parse2Dir := filepath.Join(tmpDir, "parse2") + + // The workflow name contains a ZWJ emoji sequence: 👮 (U+1F46E) + ZWJ + // (U+200D) + ♂ (U+2642) + VS-16 (U+FE0F). hclwrite escapes U+200D as + // \u200d and yaml.v3 escapes supplementary-plane chars as \UXXXXXXXX, so + // without the fix both representations acquire Unicode escapes. + emojiName := "\U0001F46E\u200D\u2642\uFE0F Lint" + content := "name: " + emojiName + "\non:\n push:\njobs:\n lint:\n runs-on: ubuntu-latest\n steps:\n - id: lint\n name: Run lint\n run: echo lint\n" + + if err := os.WriteFile(inputYAML, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + p := New() + + if err := p.Unparse(provider.ProviderOps{File: inputYAML, OutputDirectory: unparseDir}); err != nil { + t.Fatal(err) + } + + hclFile := filepath.Join(unparseDir, "lint.hcl") + hclBytes, err := os.ReadFile(hclFile) + + if err != nil { + t.Fatal(err) + } + + // HCL must not contain Unicode escape sequences for the emoji ZWJ. + hclStr := string(hclBytes) + + if strings.Contains(hclStr, `\u200d`) || strings.Contains(hclStr, `\u200D`) { + t.Errorf("HCL output contains escaped ZWJ (\\u200d); want raw UTF-8:\n%s", hclStr) + } + + if err := p.Parse(provider.ProviderOps{File: hclFile, OutputDirectory: parse2Dir}); err != nil { + t.Fatal(err) + } + + roundtripped, err := os.ReadFile(filepath.Join(parse2Dir, "lint.yaml")) + if err != nil { + t.Fatal(err) + } + + // Round-tripped YAML must not contain Unicode escape sequences. 
+ roundStr := string(roundtripped) + + if strings.Contains(roundStr, `\U`) || strings.Contains(roundStr, `\u`) { + t.Errorf("round-tripped YAML contains Unicode escapes; want raw UTF-8:\n%s", roundStr) + } + + // The emoji name must be preserved verbatim in the output. + if !strings.Contains(roundStr, emojiName) { + t.Errorf("round-tripped YAML does not contain emoji name %q:\n%s", emojiName, roundStr) + } +} diff --git a/provider/github/unparse_action.go b/provider/github/unparse_action.go index 502711f..cfb138c 100644 --- a/provider/github/unparse_action.go +++ b/provider/github/unparse_action.go @@ -217,7 +217,7 @@ func actionToHCL(doc map[string]any, filename string) ([]byte, error) { } } - return hclwrite.Format(f.Bytes()), nil + return unescapeHCLUnicode(hclwrite.Format(f.Bytes())), nil } func writeActionSteps(root *hclwrite.Body, raw any) ([]string, error) { diff --git a/provider/github/unparse_workflow.go b/provider/github/unparse_workflow.go index 48eb157..d94c20b 100644 --- a/provider/github/unparse_workflow.go +++ b/provider/github/unparse_workflow.go @@ -6,6 +6,9 @@ package github import ( "errors" "fmt" + "regexp" + "strconv" + "unicode/utf8" "github.com/goccy/go-yaml" "github.com/hashicorp/hcl/v2/hclsyntax" @@ -80,9 +83,31 @@ func workflowToHCL(doc ghworkflow.YAMLDocument, filename string) ([]byte, error) return nil, err } - return hclwrite.Format(f.Bytes()), nil + return unescapeHCLUnicode(hclwrite.Format(f.Bytes())), nil } +// unescapeHCLUnicode replaces \uXXXX and \UXXXXXXXX escape sequences in HCL +// source with their raw UTF-8 equivalents for characters above U+009F. +// hclwrite escapes any rune where Go's unicode.IsPrint returns false, which +// includes category-Cf characters like U+200D (ZWJ) used in emoji sequences. +// These are valid UTF-8 in HCL strings; keeping them escaped causes downstream +// YAML serialisers to re-escape the surrounding emoji. 
+func unescapeHCLUnicode(src []byte) []byte { + return reHCLUnicodeEscape.ReplaceAllFunc(src, func(match []byte) []byte { + n, err := strconv.ParseInt(string(match[2:]), 16, 32) + if err != nil || n <= 0x9F || !utf8.ValidRune(rune(n)) { + return match + } + + var buf [utf8.UTFMax]byte + l := utf8.EncodeRune(buf[:], rune(n)) + + return append([]byte(nil), buf[:l]...) + }) +} + +var reHCLUnicodeEscape = regexp.MustCompile(`\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}`) + func writeJobBody(root *hclwrite.Body, jobBody *hclwrite.Body, jobID string, job map[string]any, jobIDMap map[string]string, generatedVariables map[string]any) error { stepRefs := []string{} diff --git a/provider/github/workflow_yaml.go b/provider/github/workflow_yaml.go index 2ef50f5..5bdc72b 100644 --- a/provider/github/workflow_yaml.go +++ b/provider/github/workflow_yaml.go @@ -6,8 +6,11 @@ package github import ( "bytes" "fmt" + "regexp" "sort" + "strconv" "strings" + "unicode/utf8" yamlv3 "gopkg.in/yaml.v3" ) @@ -45,9 +48,30 @@ func marshalWorkflowYAML(workflow map[string]any) ([]byte, error) { out := bytes.ReplaceAll(buf.Bytes(), []byte(": {}\n"), []byte(":\n")) - return out, nil + return unescapeYAMLUnicode(out), nil } +// unescapeYAMLUnicode replaces \uXXXX and \UXXXXXXXX escape sequences in YAML +// output with their raw UTF-8 equivalents for characters above U+009F. +// gopkg.in/yaml.v3 escapes supplementary-plane characters (emoji etc.) because +// its is_printable helper only handles 3-byte UTF-8 sequences. Replacing the +// escapes restores readable output without changing the YAML semantics. +func unescapeYAMLUnicode(src []byte) []byte { + return reYAMLUnicodeEscape.ReplaceAllFunc(src, func(match []byte) []byte { + n, err := strconv.ParseInt(string(match[2:]), 16, 32) + if err != nil || n <= 0x9F || !utf8.ValidRune(rune(n)) { + return match + } + + var buf [utf8.UTFMax]byte + l := utf8.EncodeRune(buf[:], rune(n)) + + return append([]byte(nil), buf[:l]...) 
+ }) +} + +var reYAMLUnicodeEscape = regexp.MustCompile(`\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4}`) + func workflowMapNode(workflow map[string]any) (*yamlv3.Node, error) { node := &yamlv3.Node{Kind: yamlv3.MappingNode} diff --git a/provider/gitlab/pipeline_yaml.go b/provider/gitlab/pipeline_yaml.go index f870c59..079e991 100644 --- a/provider/gitlab/pipeline_yaml.go +++ b/provider/gitlab/pipeline_yaml.go @@ -6,7 +6,10 @@ package gitlab import ( "bytes" "fmt" + "regexp" "sort" + "strconv" + "unicode/utf8" yamlv3 "gopkg.in/yaml.v3" ) @@ -33,9 +36,32 @@ func marshalPipelineYAML(pipeline map[string]any) ([]byte, error) { return nil, err } - return bytes.ReplaceAll(buf.Bytes(), []byte(": {}\n"), []byte(":\n")), nil + out := bytes.ReplaceAll(buf.Bytes(), []byte(": {}\n"), []byte(":\n")) + + return unescapeYAMLUnicode(out), nil } +// unescapeYAMLUnicode replaces \uXXXX and \UXXXXXXXX escape sequences in YAML +// output with their raw UTF-8 equivalents for characters above U+009F. +// gopkg.in/yaml.v3 escapes supplementary-plane characters (emoji etc.) because +// its is_printable helper only handles 3-byte UTF-8 sequences. Replacing the +// escapes restores readable output without changing the YAML semantics. +func unescapeYAMLUnicode(src []byte) []byte { + return reYAMLUnicodeEscape.ReplaceAllFunc(src, func(match []byte) []byte { + n, err := strconv.ParseInt(string(match[2:]), 16, 32) + if err != nil || n <= 0x9F || !utf8.ValidRune(rune(n)) { + return match + } + + var buf [utf8.UTFMax]byte + l := utf8.EncodeRune(buf[:], rune(n)) + + return append([]byte(nil), buf[:l]...) 
+ }) +} + +var reYAMLUnicodeEscape = regexp.MustCompile(`\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4}`) + func pipelineMapNode(pipeline map[string]any) (*yamlv3.Node, error) { node := &yamlv3.Node{Kind: yamlv3.MappingNode} seen := map[string]struct{}{} diff --git a/provider/gitlab/unparse_pipeline.go b/provider/gitlab/unparse_pipeline.go index 97cf03e..4013daf 100644 --- a/provider/gitlab/unparse_pipeline.go +++ b/provider/gitlab/unparse_pipeline.go @@ -6,8 +6,11 @@ package gitlab import ( "fmt" "os" + "regexp" "sort" + "strconv" "strings" + "unicode/utf8" "github.com/goccy/go-yaml" "github.com/hashicorp/hcl/v2/hclwrite" @@ -292,9 +295,31 @@ func pipelineToHCL(doc map[string]any, filename string) ([]byte, error) { _ = filename - return hclwrite.Format(f.Bytes()), nil + return unescapeHCLUnicode(hclwrite.Format(f.Bytes())), nil } +// unescapeHCLUnicode replaces \uXXXX and \UXXXXXXXX escape sequences in HCL +// source with their raw UTF-8 equivalents for characters above U+009F. +// hclwrite escapes any rune where Go's unicode.IsPrint returns false, which +// includes category-Cf characters like U+200D (ZWJ) used in emoji sequences. +// These are valid UTF-8 in HCL strings; keeping them escaped causes downstream +// YAML serialisers to re-escape the surrounding emoji. +func unescapeHCLUnicode(src []byte) []byte { + return reHCLUnicodeEscape.ReplaceAllFunc(src, func(match []byte) []byte { + n, err := strconv.ParseInt(string(match[2:]), 16, 32) + if err != nil || n <= 0x9F || !utf8.ValidRune(rune(n)) { + return match + } + + var buf [utf8.UTFMax]byte + l := utf8.EncodeRune(buf[:], rune(n)) + + return append([]byte(nil), buf[:l]...) + }) +} + +var reHCLUnicodeEscape = regexp.MustCompile(`\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}`) + func writeJobBlock(body *hclwrite.Body, job map[string]any, jobIDMap map[string]string, templateIDMap map[string]string) error { for _, key := range sortedKeys(job) { value := job[key]