diff --git a/README.md b/README.md index 8b3c35e..410ae44 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,9 @@ Overview Change Logs === +2020-08-?? +- Add XML stream loading and parsing support. + 2019-11-11 - Add XPath query caching. @@ -48,26 +51,58 @@ if err != nil { } ``` -#### Parse a XML from URL. +#### Parse an XML from URL. ```go doc, err := xmlquery.LoadURL("http://www.example.com/sitemap.xml") ``` -#### Parse a XML from string. +#### Parse an XML from string. ```go s := `` doc, err := xmlquery.Parse(strings.NewReader(s)) ``` -#### Parse a XML from io.Reader. +#### Parse an XML from io.Reader. ```go f, err := os.Open("../books.xml") doc, err := xmlquery.Parse(f) ``` +#### Parse an XML in a stream fashion (simple case without element filtering). + +```go +f, err := os.Open("../books.xml") +p, err := xmlquery.CreateStreamParser(f, "/bookstore/book") +for { + n, err := p.Read() + if err == io.EOF { + break + } + if err != nil { + ... + } +} +``` + +#### Parse an XML in a stream fashion (simple case advanced element filtering). + +```go +f, err := os.Open("../books.xml") +p, err := xmlquery.CreateStreamParser(f, "/bookstore/book", "/bookstore/book[price>=10]") +for { + n, err := p.Read() + if err == io.EOF { + break + } + if err != nil { + ... + } +} +``` + #### Find authors of all books in the bookstore. 
```go @@ -210,11 +245,11 @@ func main(){ List of supported XPath query packages === -|Name |Description | -|--------------------------|----------------| -|[htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document| -|[xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document| -|[jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document| +| Name | Description | +| ------------------------------------------------- | ----------------------------------------- | +| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document | +| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document | +| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document | Questions === diff --git a/parse.go b/parse.go index 618ad2c..cf7ab39 100644 --- a/parse.go +++ b/parse.go @@ -3,10 +3,12 @@ package xmlquery import ( "encoding/xml" "errors" + "fmt" "io" "net/http" "strings" + "github.com/antchfx/xpath" "golang.org/x/net/html/charset" ) @@ -20,12 +22,30 @@ func LoadURL(url string) (*Node, error) { return Parse(resp.Body) } +// Parse returns the parse tree for the XML from the given Reader. +func Parse(r io.Reader) (*Node, error) { + p := createParser(r) + for { + _, err := p.parse() + if err == io.EOF { + return p.doc, nil + } + if err != nil { + return nil, err + } + } +} + type parser struct { - decoder *xml.Decoder - doc *Node - space2prefix map[string]string - level int - prev *Node + decoder *xml.Decoder + doc *Node + space2prefix map[string]string + level int + prev *Node + streamElementXPath *xpath.Expr // Under streaming mode, this specifies the xpath to the target element node(s). + streamElementFilter *xpath.Expr // If specified, it provides a further filtering on the target element. 
+ streamNode *Node // Need to remember the last target node so we can clean it up upon next Read() call. + streamNodePrev *Node // Need to remember target node's prev so upon target node removal, we can restore correct prev. } func createParser(r io.Reader) *parser { @@ -43,6 +63,8 @@ func createParser(r io.Reader) *parser { } func (p *parser) parse() (*Node, error) { + var streamElementNodeCounter int + for { tok, err := p.decoder.Token() if err != nil { @@ -99,10 +121,54 @@ func (p *parser) parse() (*Node, error) { } addSibling(p.prev.Parent, node) } + // If we're in the streaming mode, we need to remember the node if it is the target node + // so that when we finish processing the node's EndElement, we know how/what to return to + // caller. Also we need to remove the target node from the tree upon next Read() call so + // memory doesn't grow unbounded. + if p.streamElementXPath != nil { + if p.streamNode == nil { + if QuerySelector(p.doc, p.streamElementXPath) != nil { + p.streamNode = node + p.streamNodePrev = p.prev + streamElementNodeCounter = 1 + } + } else { + streamElementNodeCounter++ + } + } p.prev = node p.level++ case xml.EndElement: p.level-- + // If we're in streaming mode, and we already have a potential streaming + // target node identified (p.streamNode != nil) then we need to check if + // this is the real one we want to return to caller. + if p.streamNode != nil { + streamElementNodeCounter-- + if streamElementNodeCounter == 0 { + // Now we know this element node is at least passing the initial + // p.streamElementXPath check and is a potential target node candidate. + // We need to have 1 more check with p.streamElementFilter (if given) to + // ensure it is really the element node we want. + // The reason we need a two-step check process is because the following + // situation: + // b1 + // And say the p.streamElementXPath = "/AAA/BBB[. != 'b1']". 
Now during + // xml.StartElement time, the node is still empty, so it will pass + // the p.streamElementXPath check. However, eventually we know this + // shouldn't be returned to the caller. Having a second more fine-grained + // filter check ensures that. So in this case, the caller should really + // setup the stream parser with: + // streamElementXPath = "/AAA/BBB[" + // streamElementFilter = "/AAA/BBB[. != 'b1']" + if p.streamElementFilter == nil || QuerySelector(p.doc, p.streamElementFilter) != nil { + return p.streamNode, nil + } + // otherwise, this isn't our target node. clean things up. + p.streamNode = nil + p.streamNodePrev = nil + } + } case xml.CharData: node := &Node{Type: CharDataNode, Data: string(tok), level: p.level} if p.level == p.prev.level { @@ -150,16 +216,87 @@ func (p *parser) parse() (*Node, error) { } } -// Parse returns the parse tree for the XML from the given Reader. -func Parse(r io.Reader) (*Node, error) { - p := createParser(r) - for { - _, err := p.parse() - if err == io.EOF { - return p.doc, nil - } +// StreamParser enables loading and parsing an XML document in a streaming fashion. +type StreamParser struct { + p *parser +} + +// CreateStreamParser creates a StreamParser. Argument streamElementXPath is required. +// Argument streamElementFilter is optional and should only be used in advanced scenarios. +// +// Scenario 1: simple case: +// xml := `b1b2` +// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB") +// if err != nil { +// panic(err) +// } +// for { +// n, err := sp.Read() +// if err != nil { +// break +// } +// fmt.Println(n.OutputXML(true)) +// } +// Output will be: +// b1 +// b2 +// +// Scenario 2: advanced case: +// xml := `b1b2` +// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. 
!= 'b1']") +// if err != nil { +// panic(err) +// } +// for { +// n, err := sp.Read() +// if err != nil { +// break +// } +// fmt.Println(n.OutputXML(true)) +// } +// Output will be: +// b2 +// +// As the argument names indicate, streamElementXPath should be used for providing xpath query pointing +// to the target element node only, no extra filtering on the element itself or its children; while +// streamElementFilter, if needed, can provide additional filtering on the target element and its children. +// +// CreateStreamParser returns error if either streamElementXPath or streamElementFilter, if provided, cannot +// be successfully parsed and compiled into a valid xpath query. +func CreateStreamParser(r io.Reader, streamElementXPath string, streamElementFilter ...string) (*StreamParser, error) { + elemXPath, err := getQuery(streamElementXPath) + if err != nil { + return nil, fmt.Errorf("invalid streamElementXPath '%s', err: %s", streamElementXPath, err.Error()) + } + elemFilter := (*xpath.Expr)(nil) + if len(streamElementFilter) > 0 { + elemFilter, err = getQuery(streamElementFilter[0]) if err != nil { - return nil, fmt.Errorf("invalid streamElementFilter '%s', err: %s", streamElementFilter[0], err.Error()) } } + sp := &StreamParser{ + p: createParser(r), + } + sp.p.streamElementXPath = elemXPath + sp.p.streamElementFilter = elemFilter + return sp, nil +} + +// Read returns a target node that satisfies the XPath specified by caller at StreamParser creation
// time. If there is no more satisfying target node after reading the rest of the XML document, io.EOF
// will be returned. At any time, any XML parsing error encountered, the error will be returned and
// the stream parsing is stopped. Calling Read() after an error is returned (including io.EOF) is not
// allowed; the behavior will be undefined. Also note, due to the streaming nature, calling Read() will
// automatically remove any previous target node(s) from the document tree. 
+func (sp *StreamParser) Read() (*Node, error) { + // Because this is a streaming read, we need to release/remove last + // target node from the node tree to free up memory. + if sp.p.streamNode != nil { + removeFromTree(sp.p.streamNode) + sp.p.prev = sp.p.streamNodePrev + sp.p.streamNode = nil + sp.p.streamNodePrev = nil + } + return sp.p.parse() } diff --git a/parse_test.go b/parse_test.go index 8c1f547..894ab51 100644 --- a/parse_test.go +++ b/parse_test.go @@ -1,6 +1,7 @@ package xmlquery import ( + "io" "net/http" "net/http/httptest" "strings" @@ -250,3 +251,146 @@ func TestCharData(t *testing.T) { testValue(t, cdata.InnerText(), "Richard Lawler") } + +func TestStreamParser_InvalidXPath(t *testing.T) { + sp, err := CreateStreamParser(strings.NewReader(""), "[invalid") + if err == nil || err.Error() != "invalid streamElementXPath '[invalid', err: expression must evaluate to a node-set" { + t.Fatalf("got non-expected error: %v", err) + } + if sp != nil { + t.Fatal("expected nil for sp, but got none-nil value") + } + + sp, err = CreateStreamParser(strings.NewReader(""), ".", "[invalid") + if err == nil || err.Error() != "invalid streamElementFilter '[invalid', err: expression must evaluate to a node-set" { + t.Fatalf("got non-expected error: %v", err) + } + if sp != nil { + t.Fatal("expected nil for sp, but got none-nil value") + } +} + +func root(n *Node) *Node { + if n == nil { + return nil + } + for ; n.Parent != nil; n = n.Parent { + } + return n +} + +func testOutputXML(t *testing.T, msg string, expectedXML string, n *Node) { + if n.OutputXML(true) != expectedXML { + t.Fatalf("%s, expected XML: '%s', actual: '%s'", msg, expectedXML, n.OutputXML(true)) + } +} + +func TestStreamParser_Success1(t *testing.T) { + s := ` + + c1 + b1 + d1 + b2z1 + b3 + b4 + b5 + c3 + ` + + sp, err := CreateStreamParser(strings.NewReader(s), "/AAA/BBB", "/AAA/BBB[. 
!= 'b3']") + if err != nil { + t.Fatal(err.Error()) + } + + // First `` read + n, err := sp.Read() + if err != nil { + t.Fatal(err.Error()) + } + testOutputXML(t, "first call result", `b1`, n) + testOutputXML(t, "doc after first call", `<>c1b1`, root(n)) + + // Second `` read + n, err = sp.Read() + if err != nil { + t.Fatal(err.Error()) + } + testOutputXML(t, "second call result", `b2z1`, n) + testOutputXML(t, "doc after second call", + `<>c1d1b2z1`, root(n)) + + // Third `` read (Note we will skip 'b3' since the streamElementFilter excludes it) + n, err = sp.Read() + if err != nil { + t.Fatal(err.Error()) + } + testOutputXML(t, "third call result", `b4`, n) + // Note the inclusion of `b3` in the document? This is because `b3` has + // been filtered out and is not our target node, thus it is considered just like any other + // non target nodes such as ``` or `` + testOutputXML(t, "doc after third call", + `<>c1d1b3b4`, root(n)) + + // Fourth `` read + n, err = sp.Read() + if err != nil { + t.Fatal(err.Error()) + } + testOutputXML(t, "fourth call result", `b5`, n) + // Note the inclusion of `b3` in the document. 
+ testOutputXML(t, "doc after fourth call", + `<>c1d1b3b5`, root(n)) + + _, err = sp.Read() + if err != io.EOF { + t.Fatalf("io.EOF expected, but got %v", err) + } +} + +func TestStreamParser_Success2(t *testing.T) { + s := ` + + c1 + b1 + d1 + b2 + c2 + ` + + sp, err := CreateStreamParser(strings.NewReader(s), "/AAA/CCC | /AAA/DDD") + if err != nil { + t.Fatal(err.Error()) + } + + // First Read() should return c1 + n, err := sp.Read() + if err != nil { + t.Fatal(err.Error()) + } + testOutputXML(t, "first call result", `c1`, n) + testOutputXML(t, "doc after first call", `<>c1`, root(n)) + + // Second Read() should return d1 + n, err = sp.Read() + if err != nil { + t.Fatal(err.Error()) + } + testOutputXML(t, "second call result", `d1`, n) + testOutputXML(t, "doc after second call", + `<>b1d1`, root(n)) + + // Third call should return c2 + n, err = sp.Read() + if err != nil { + t.Fatal(err.Error()) + } + testOutputXML(t, "third call result", `c2`, n) + testOutputXML(t, "doc after third call", + `<>b1b2c2`, root(n)) + + _, err = sp.Read() + if err != io.EOF { + t.Fatalf("io.EOF expected, but got %v", err) + } +}