From b5fba2a870c86ebcff88f2c6febf82604838aae8 Mon Sep 17 00:00:00 2001 From: Philip Hamelink Date: Mon, 18 May 2026 17:19:52 +0200 Subject: [PATCH] feat: add TEI (XML) -> MinML converter --- go/markup/minml/cmd/minml.go | 32 ++++++--- go/markup/xml/treeparser.go | 107 +++++++++++++++++++++++++++++++ go/markup/xml/treeparser_test.go | 94 +++++++++++++++++++++++++++ 3 files changed, 225 insertions(+), 8 deletions(-) create mode 100644 go/markup/xml/treeparser.go create mode 100644 go/markup/xml/treeparser_test.go diff --git a/go/markup/minml/cmd/minml.go b/go/markup/minml/cmd/minml.go index 908a311..5706112 100644 --- a/go/markup/minml/cmd/minml.go +++ b/go/markup/minml/cmd/minml.go @@ -23,24 +23,24 @@ package main import ( "fmt" - "github.com/dedis/matchertext/go/markup/minml" "log" "os" "strings" + + "github.com/dedis/matchertext/go/markup/minml" + "github.com/dedis/matchertext/go/markup/xml" ) const usage = `MinML Command-Line Tool USAGE: - %s [COMMAND] [OPTIONS] - -ARGS: - MinML source file + %s [COMMAND] [OPTIONS] COMMANDS: help Print this help message - convert Parse MinML and write HTML to stdout (default) - server [OPTIONS] Start an HTTP server for MinML conversion + convert Parse MinML and write HTML to stdout (default) + from-xml Convert XML to MinML and write to stdout + server [OPTIONS] Start an HTTP server for MinML conversion OPTIONS (server): --port Port to listen on (default: 8080) @@ -54,10 +54,12 @@ DESCRIPTION: EXAMPLES: %[1]s input.minml %[1]s convert input.minml + %[1]s from-xml input.xml %[1]s server input.minml ` const CmdConvert = "convert" +const CmdFromXML = "from-xml" const CmdServer = "server" func main() { @@ -89,6 +91,20 @@ func main() { if err := minml.Convert(inputPath, os.Stdout, true, extensions); err != nil { log.Fatal(err) } + case CmdFromXML: + f, err := os.Open(inputPath) + if err != nil { + log.Fatal(err) + } + defer f.Close() + + ns, err := xml.NewTreeParser(f).ParseAST() + if err != nil { + log.Fatalf("parsing %s: %v", 
inputPath, err) + } + if err := minml.NewTreeWriter(os.Stdout).WriteAST(ns); err != nil { + log.Fatalf("writing MinML: %v", err) + } case CmdServer: port := "8080" noOpen := false @@ -130,7 +146,7 @@ func parseArgs(args []string) (command string, inputPath string, rest []string) case "help": printUsage(args[0]) os.Exit(0) - case CmdConvert, CmdServer: + case CmdConvert, CmdFromXML, CmdServer: if len(args) < 3 { log.Fatalf("'%s' requires an input file", args[1]) } diff --git a/go/markup/xml/treeparser.go b/go/markup/xml/treeparser.go new file mode 100644 index 0000000..19ee7fc --- /dev/null +++ b/go/markup/xml/treeparser.go @@ -0,0 +1,107 @@ +package xml + +import ( + "encoding/xml" + "fmt" + "io" + "strings" + + "github.com/dedis/matchertext/go/markup/ast" +) + +// TreeParser parses an XML stream into an abstract syntax tree (AST). +// It uses raw XML tokenization to preserve original element and attribute +// names including namespace prefixes. +type TreeParser struct { + dec *xml.Decoder +} + +// NewTreeParser creates a TreeParser to parse XML from r. +func NewTreeParser(r io.Reader) *TreeParser { + dec := xml.NewDecoder(r) + dec.Strict = false + return &TreeParser{dec: dec} +} + +// ParseAST parses the XML input and returns the top-level AST nodes. +func (p *TreeParser) ParseAST() ([]ast.Node, error) { + return p.parseContent(false, "") +} + +// parseContent reads AST nodes from the XML stream. +// When insideElement is true it stops at the next EndElement, which it consumes. 
+// elemName is the raw name of the enclosing element ("" at top level). It is
+// used in error messages and to reject an end tag that closes a different
+// element than the one currently open.
+func (p *TreeParser) parseContent(insideElement bool, elemName string) ([]ast.Node, error) {
+	var nodes []ast.Node
+	for {
+		tok, err := p.dec.RawToken()
+		if err == io.EOF {
+			// EOF is only legal at top level; inside an element it means
+			// the closing tag is missing.
+			if insideElement {
+				return nil, fmt.Errorf("unexpected EOF inside element <%s>", elemName)
+			}
+			return nodes, nil
+		}
+		if err != nil {
+			return nil, err
+		}
+
+		switch t := tok.(type) {
+		case xml.CharData:
+			// string(t) copies the bytes, so the text stays valid after
+			// the next RawToken call.
+			if s := string(t); s != "" {
+				nodes = append(nodes, ast.NewText(s))
+			}
+
+		case xml.Comment:
+			nodes = append(nodes, ast.NewComment(string(t)))
+
+		case xml.StartElement:
+			elt, err := p.parseElement(t)
+			if err != nil {
+				return nil, err
+			}
+			nodes = append(nodes, elt)
+
+		case xml.EndElement:
+			closing := rawXMLName(t.Name)
+			if !insideElement {
+				return nil, fmt.Errorf("unexpected end element </%s>", closing)
+			}
+			// RawToken does not verify that start and end tags match,
+			// so a mismatch must be detected here to avoid silently
+			// building a wrongly nested tree.
+			if closing != elemName {
+				return nil, fmt.Errorf("end element </%s> does not match start element <%s>", closing, elemName)
+			}
+			return nodes, nil
+
+		case xml.ProcInst, xml.Directive:
+			// skip processing instructions and DOCTYPE declarations
+		}
+	}
+}
+
+// parseElement builds the AST node for the element opened by start.
+// Attributes come first in the element's children, in source order, followed
+// by the element's content, which is read up to and including the end tag.
+// Namespace declarations (xmlns, xmlns:*) are dropped.
+func (p *TreeParser) parseElement(start xml.StartElement) (ast.Node, error) {
+	name := rawXMLName(start.Name)
+
+	var ns []ast.Node
+	for _, a := range start.Attr {
+		attrName := rawXMLName(a.Name)
+		if isXMLNS(attrName) {
+			continue // namespace declarations have no meaning in MinML
+		}
+		ns = append(ns, ast.NewAttribute(attrName, ast.NewText(a.Value)))
+	}
+
+	content, err := p.parseContent(true, name)
+	if err != nil {
+		return nil, err
+	}
+
+	return ast.NewElement(name, append(ns, content...)...), nil
+}
+
+// rawXMLName formats an xml.Name as it appeared in the source.
+// With RawToken(), Name.Space holds the namespace prefix (not the URI),
+// so a prefixed name is rebuilt as "prefix:local".
+func rawXMLName(name xml.Name) string {
+	if name.Space != "" {
+		return name.Space + ":" + name.Local
+	}
+	return name.Local
+}
+
+// isXMLNS reports whether an attribute name is a namespace declaration.
+func isXMLNS(name string) bool { + return name == "xmlns" || strings.HasPrefix(name, "xmlns:") +} diff --git a/go/markup/xml/treeparser_test.go b/go/markup/xml/treeparser_test.go new file mode 100644 index 0000000..af383ef --- /dev/null +++ b/go/markup/xml/treeparser_test.go @@ -0,0 +1,94 @@ +package xml + +import ( + "strings" + "testing" + + "github.com/dedis/matchertext/go/markup/ast" +) + +type parseTest struct { + xml string + ast []ast.Node +} + +func pt(xmlStr string, ns ...ast.Node) parseTest { + return parseTest{xml: xmlStr, ast: ns} +} + +var parseTests = []parseTest{ + + // Empty input + pt(""), + pt(" ", aText(" ")), + pt(""), + + // Plain text + pt("hello", aText("hello")), + + // Comments + pt("", aComment(" hi ")), + pt("", aComment(" a "), aComment(" b ")), + + // Simple elements + pt("

", aElem("p")), + pt("
", aElem("br")), + pt("emphasis", aElem("em", aText("emphasis"))), + pt("nested", aElem("i", aElem("b", aText("nested")))), + + // Self-closing with whitespace-only text around it + pt("

", aText(" "), aElem("p"), aText(" ")), + + // Elements with attributes + pt(`link`, + aElem("a", aAttr("href", aText("foo")), aText("link"))), + pt(`bar`, + aElem("img", aAttr("src", aText("foo")), aAttr("alt", aText("bar")))), + + // Namespace prefix on element and attribute + pt(`text`, + aElem("tei:p", aAttr("xml:lang", aText("en")), aText("text"))), + + // xmlns declarations are dropped + pt(``, + aElem("TEI", aElem("body"))), + + // Mixed content — representative TEI fragment + pt(`

I met John.

`, + aElem("p", + aText("I met "), + aElem("persName", aAttr("ref", aText("#JohnSmith")), aText("John")), + aText("."), + )), + + // Self-closing element with attributes (lb, gap) + pt(``, aElem("lb", aAttr("n", aText("4")))), + pt(``, + aElem("gap", + aAttr("reason", aText("damage")), + aAttr("extent", aText("2")), + aAttr("unit", aText("chars")), + )), + + // DOCTYPE and processing instructions are skipped + pt("

", aElem("p")), + pt("

", aElem("p")), +} + +func TestTreeParser(t *testing.T) { + for i, pt := range parseTests { + d := NewTreeParser(strings.NewReader(pt.xml)) + got, err := d.ParseAST() + if err != nil { + t.Errorf("%v %q: unexpected error: %v", i, pt.xml, err) + continue + } + want := pt.ast + if want == nil { + want = []ast.Node{} + } + if !ast.Equal(got, want) { + t.Errorf("%v %q:\n got %v\n want %v", i, pt.xml, got, want) + } + } +}