Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 24 additions & 8 deletions go/markup/minml/cmd/minml.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,24 @@ package main

import (
"fmt"
"github.com/dedis/matchertext/go/markup/minml"
"log"
"os"
"strings"

"github.com/dedis/matchertext/go/markup/minml"
"github.com/dedis/matchertext/go/markup/xml"
)

const usage = `MinML Command-Line Tool

USAGE:
%s [COMMAND] <input.minml> [OPTIONS]

ARGS:
<input.minml> MinML source file
%s [COMMAND] <input> [OPTIONS]

COMMANDS:
help Print this help message
convert <file.minml> Parse MinML and write HTML to stdout (default)
server <file|directory> [OPTIONS] Start an HTTP server for MinML conversion
convert <file.minml> Parse MinML and write HTML to stdout (default)
from-xml <file.xml> Convert XML to MinML and write to stdout
server <file|directory> [OPTIONS] Start an HTTP server for MinML conversion

OPTIONS (server):
--port <port> Port to listen on (default: 8080)
Expand All @@ -54,10 +54,12 @@ DESCRIPTION:
EXAMPLES:
%[1]s input.minml
%[1]s convert input.minml
%[1]s from-xml input.xml
%[1]s server input.minml
`

// Subcommand names accepted as the first command-line argument.
const (
	// CmdConvert parses MinML and writes HTML to stdout (the default command).
	CmdConvert = "convert"
	// CmdFromXML converts XML to MinML and writes it to stdout.
	CmdFromXML = "from-xml"
	// CmdServer starts an HTTP server for MinML conversion.
	CmdServer = "server"
)

func main() {
Expand Down Expand Up @@ -89,6 +91,20 @@ func main() {
if err := minml.Convert(inputPath, os.Stdout, true, extensions); err != nil {
log.Fatal(err)
}
case CmdFromXML:
f, err := os.Open(inputPath)
if err != nil {
log.Fatal(err)
}
defer f.Close()

ns, err := xml.NewTreeParser(f).ParseAST()
if err != nil {
log.Fatalf("parsing %s: %v", inputPath, err)
}
if err := minml.NewTreeWriter(os.Stdout).WriteAST(ns); err != nil {
log.Fatalf("writing MinML: %v", err)
}
case CmdServer:
port := "8080"
noOpen := false
Expand Down Expand Up @@ -130,7 +146,7 @@ func parseArgs(args []string) (command string, inputPath string, rest []string)
case "help":
printUsage(args[0])
os.Exit(0)
case CmdConvert, CmdServer:
case CmdConvert, CmdFromXML, CmdServer:
if len(args) < 3 {
log.Fatalf("'%s' requires an input file", args[1])
}
Expand Down
107 changes: 107 additions & 0 deletions go/markup/xml/treeparser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package xml

import (
"encoding/xml"
"fmt"
"io"
"strings"

"github.com/dedis/matchertext/go/markup/ast"
)

// TreeParser parses an XML stream into an abstract syntax tree (AST).
// It uses raw XML tokenization to preserve original element and attribute
// names including namespace prefixes.
//
// Create one with NewTreeParser and call ParseAST to obtain the nodes.
type TreeParser struct {
// dec is the underlying decoder; tokens are pulled from it with RawToken.
dec *xml.Decoder
}

// NewTreeParser returns a TreeParser that reads XML from r.
// The underlying decoder is put into non-strict mode.
func NewTreeParser(r io.Reader) *TreeParser {
	p := &TreeParser{dec: xml.NewDecoder(r)}
	p.dec.Strict = false
	return p
}

// ParseAST consumes the XML input and returns its top-level AST nodes,
// or the first error encountered while parsing.
func (p *TreeParser) ParseAST() ([]ast.Node, error) {
	const topLevel = false // not nested inside any element
	nodes, err := p.parseContent(topLevel, "")
	return nodes, err
}

// parseContent reads AST nodes from the XML stream until the enclosing
// construct ends.
//
// At the top level (insideElement false) it reads until EOF and rejects any
// stray end element. Inside an element (insideElement true) it reads until
// the end element that closes elemName, which it consumes; reaching EOF
// before that point is an error.
//
// RawToken does not verify that start and end elements match, so that check
// is performed here: an end element whose raw name differs from elemName is
// reported as an error instead of silently closing the wrong element.
//
// Character data becomes text nodes, comments become comment nodes, and
// start elements are parsed recursively via parseElement. Processing
// instructions and directives (such as DOCTYPE) are dropped.
func (p *TreeParser) parseContent(insideElement bool, elemName string) ([]ast.Node, error) {
	var nodes []ast.Node
	for {
		tok, err := p.dec.RawToken()
		if err == io.EOF {
			if insideElement {
				return nil, fmt.Errorf("unexpected EOF inside element <%s>", elemName)
			}
			return nodes, nil
		}
		if err != nil {
			return nil, err
		}

		switch t := tok.(type) {
		case xml.CharData:
			// string(t) copies the bytes, so the text stays valid after
			// the decoder moves on to the next token.
			if s := string(t); s != "" {
				nodes = append(nodes, ast.NewText(s))
			}

		case xml.Comment:
			nodes = append(nodes, ast.NewComment(string(t)))

		case xml.StartElement:
			elt, err := p.parseElement(t)
			if err != nil {
				return nil, err
			}
			nodes = append(nodes, elt)

		case xml.EndElement:
			closing := rawXMLName(t.Name)
			if !insideElement {
				return nil, fmt.Errorf("unexpected end element </%s>", closing)
			}
			if closing != elemName {
				return nil, fmt.Errorf("end element </%s> does not match start element <%s>", closing, elemName)
			}
			return nodes, nil

		case xml.ProcInst, xml.Directive:
			// skip processing instructions and DOCTYPE declarations
		}
	}
}

// parseElement builds the AST element for an already-read start tag.
// The element's children are its attributes (in source order, with xmlns
// declarations omitted) followed by its parsed content. The matching end
// element is consumed by the nested parseContent call.
func (p *TreeParser) parseElement(start xml.StartElement) (ast.Node, error) {
	elemName := rawXMLName(start.Name)

	var children []ast.Node
	for i := range start.Attr {
		attr := start.Attr[i]
		key := rawXMLName(attr.Name)
		if !isXMLNS(key) {
			// namespace declarations have no meaning in MinML, so only
			// ordinary attributes are kept
			children = append(children, ast.NewAttribute(key, ast.NewText(attr.Value)))
		}
	}

	body, err := p.parseContent(true, elemName)
	if err != nil {
		return nil, err
	}
	children = append(children, body...)

	return ast.NewElement(elemName, children...), nil
}

// rawXMLName renders an xml.Name the way it was written in the source:
// "prefix:local" when a prefix is present, otherwise just "local".
// Tokens obtained from RawToken() carry the namespace prefix (not the
// resolved URI) in Name.Space.
func rawXMLName(name xml.Name) string {
	if name.Space == "" {
		return name.Local
	}
	return name.Space + ":" + name.Local
}

// isXMLNS reports whether the raw attribute name declares a namespace,
// i.e. it is exactly "xmlns" or starts with "xmlns:".
func isXMLNS(name string) bool {
	const decl = "xmlns"
	if !strings.HasPrefix(name, decl) {
		return false
	}
	rest := name[len(decl):]
	return rest == "" || rest[0] == ':'
}
94 changes: 94 additions & 0 deletions go/markup/xml/treeparser_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package xml

import (
"strings"
"testing"

"github.com/dedis/matchertext/go/markup/ast"
)

// parseTest pairs an XML input with the AST nodes the parser is expected
// to produce for it.
type parseTest struct {
// xml is the XML source handed to the parser.
xml string
// ast holds the expected top-level nodes; nil means no nodes are expected.
ast []ast.Node
}

// pt is a shorthand constructor for a parseTest: xmlStr is the input and
// ns lists the nodes expected from parsing it.
func pt(xmlStr string, ns ...ast.Node) parseTest {
	var tc parseTest
	tc.xml = xmlStr
	tc.ast = ns
	return tc
}

// parseTests is the table of inputs that must parse without error,
// each paired with the expected top-level nodes.
var parseTests = []parseTest{

// Empty input, whitespace-only input, and a lone XML declaration
pt(""),
pt(" ", aText(" ")),
pt("<?xml version=\"1.0\"?>"),

// Plain text
pt("hello", aText("hello")),

// Comments (the text between the delimiters is kept verbatim, spaces included)
pt("<!-- hi -->", aComment(" hi ")),
pt("<!-- a --><!-- b -->", aComment(" a "), aComment(" b ")),

// Simple elements
pt("<p/>", aElem("p")),
pt("<br/>", aElem("br")),
pt("<em>emphasis</em>", aElem("em", aText("emphasis"))),
pt("<i><b>nested</b></i>", aElem("i", aElem("b", aText("nested")))),

// Self-closing with whitespace-only text around it
pt(" <p/> ", aText(" "), aElem("p"), aText(" ")),

// Elements with attributes (attributes come before content, in source order)
pt(`<a href="foo">link</a>`,
aElem("a", aAttr("href", aText("foo")), aText("link"))),
pt(`<img src="foo" alt="bar"/>`,
aElem("img", aAttr("src", aText("foo")), aAttr("alt", aText("bar")))),

// Namespace prefix on element and attribute is preserved
pt(`<tei:p xml:lang="en">text</tei:p>`,
aElem("tei:p", aAttr("xml:lang", aText("en")), aText("text"))),

// xmlns declarations are dropped
pt(`<TEI xmlns="http://www.tei-c.org/ns/1.0"><body/></TEI>`,
aElem("TEI", aElem("body"))),

// Mixed content: representative TEI fragment
pt(`<p>I met <persName ref="#JohnSmith">John</persName>.</p>`,
aElem("p",
aText("I met "),
aElem("persName", aAttr("ref", aText("#JohnSmith")), aText("John")),
aText("."),
)),

// Self-closing element with attributes (lb, gap)
pt(`<lb n="4"/>`, aElem("lb", aAttr("n", aText("4")))),
pt(`<gap reason="damage" extent="2" unit="chars"/>`,
aElem("gap",
aAttr("reason", aText("damage")),
aAttr("extent", aText("2")),
aAttr("unit", aText("chars")),
)),

// DOCTYPE and processing instructions are skipped
pt("<!DOCTYPE foo []><p/>", aElem("p")),
pt("<?xml version=\"1.0\"?><p/>", aElem("p")),
}

// TestTreeParser feeds every entry of parseTests through a fresh TreeParser
// and checks that the parsed nodes equal the expected ones.
func TestTreeParser(t *testing.T) {
	for idx := range parseTests {
		tc := parseTests[idx]

		nodes, err := NewTreeParser(strings.NewReader(tc.xml)).ParseAST()
		if err != nil {
			t.Errorf("%v %q: unexpected error: %v", idx, tc.xml, err)
			continue
		}

		// Cases declared without expected nodes compare against an
		// empty (non-nil) slice.
		expected := tc.ast
		if expected == nil {
			expected = []ast.Node{}
		}

		if ast.Equal(nodes, expected) {
			continue
		}
		t.Errorf("%v %q:\n got %v\n want %v", idx, tc.xml, nodes, expected)
	}
}