diff --git a/README.md b/README.md
index 8b3c35e..410ae44 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,9 @@ Overview
Change Logs
===
+2020-08-??
+- Add XML stream loading and parsing support.
+
2019-11-11
- Add XPath query caching.
@@ -48,26 +51,58 @@ if err != nil {
}
```
-#### Parse a XML from URL.
+#### Parse an XML from URL.
```go
doc, err := xmlquery.LoadURL("http://www.example.com/sitemap.xml")
```
-#### Parse a XML from string.
+#### Parse an XML from string.
```go
s := ``
doc, err := xmlquery.Parse(strings.NewReader(s))
```
-#### Parse a XML from io.Reader.
+#### Parse an XML from io.Reader.
```go
f, err := os.Open("../books.xml")
doc, err := xmlquery.Parse(f)
```
+#### Parse an XML in a stream fashion (simple case without element filtering).
+
+```go
+f, err := os.Open("../books.xml")
+p, err := xmlquery.CreateStreamParser(f, "/bookstore/book")
+for {
+ n, err := p.Read()
+ if err == io.EOF {
+ break
+ }
+ if err != nil {
+ ...
+ }
+}
+```
+
+#### Parse an XML in a stream fashion (advanced case with element filtering).
+
+```go
+f, err := os.Open("../books.xml")
+p, err := xmlquery.CreateStreamParser(f, "/bookstore/book", "/bookstore/book[price>=10]")
+for {
+ n, err := p.Read()
+ if err == io.EOF {
+ break
+ }
+ if err != nil {
+ ...
+ }
+}
+```
+
#### Find authors of all books in the bookstore.
```go
@@ -210,11 +245,11 @@ func main(){
List of supported XPath query packages
===
-|Name |Description |
-|--------------------------|----------------|
-|[htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document|
-|[xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document|
-|[jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document|
+| Name | Description |
+| ------------------------------------------------- | ----------------------------------------- |
+| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
+| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
+| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
Questions
===
diff --git a/parse.go b/parse.go
index 618ad2c..cf7ab39 100644
--- a/parse.go
+++ b/parse.go
@@ -3,10 +3,12 @@ package xmlquery
import (
"encoding/xml"
"errors"
+ "fmt"
"io"
"net/http"
"strings"
+ "github.com/antchfx/xpath"
"golang.org/x/net/html/charset"
)
@@ -20,12 +22,30 @@ func LoadURL(url string) (*Node, error) {
return Parse(resp.Body)
}
+// Parse returns the parse tree for the XML from the given Reader.
+func Parse(r io.Reader) (*Node, error) {
+ p := createParser(r)
+ for {
+ _, err := p.parse()
+ if err == io.EOF {
+ return p.doc, nil
+ }
+ if err != nil {
+ return nil, err
+ }
+ }
+}
+
type parser struct {
- decoder *xml.Decoder
- doc *Node
- space2prefix map[string]string
- level int
- prev *Node
+ decoder *xml.Decoder
+ doc *Node
+ space2prefix map[string]string
+ level int
+ prev *Node
+ streamElementXPath *xpath.Expr // Under streaming mode, this specifies the xpath to the target element node(s).
+ streamElementFilter *xpath.Expr // If specified, it provides further filtering on the target element.
+ streamNode *Node // Need to remember the last target node so we can clean it up upon next Read() call.
+ streamNodePrev *Node // Need to remember target node's prev so upon target node removal, we can restore correct prev.
}
func createParser(r io.Reader) *parser {
@@ -43,6 +63,8 @@ func createParser(r io.Reader) *parser {
}
func (p *parser) parse() (*Node, error) {
+ var streamElementNodeCounter int
+
for {
tok, err := p.decoder.Token()
if err != nil {
@@ -99,10 +121,54 @@ func (p *parser) parse() (*Node, error) {
}
addSibling(p.prev.Parent, node)
}
+ // If we're in the streaming mode, we need to remember the node if it is the target node
+ // so that when we finish processing the node's EndElement, we know how/what to return to
+ // caller. Also we need to remove the target node from the tree upon next Read() call so
+ // memory doesn't grow unbounded.
+ if p.streamElementXPath != nil {
+ if p.streamNode == nil {
+ if QuerySelector(p.doc, p.streamElementXPath) != nil {
+ p.streamNode = node
+ p.streamNodePrev = p.prev
+ streamElementNodeCounter = 1
+ }
+ } else {
+ streamElementNodeCounter++
+ }
+ }
p.prev = node
p.level++
case xml.EndElement:
p.level--
+ // If we're in streaming mode, and we already have a potential streaming
+ // target node identified (p.streamNode != nil) then we need to check if
+ // this is the real one we want to return to caller.
+ if p.streamNode != nil {
+ streamElementNodeCounter--
+ if streamElementNodeCounter == 0 {
+ // Now we know this element node is at least passing the initial
+ // p.streamElementXPath check and is a potential target node candidate.
+ // We need to have 1 more check with p.streamElementFilter (if given) to
+ // ensure it is really the element node we want.
+ // The reason we need a two-step check process is because the following
+ // situation:
+ // <AAA><BBB>b1</BBB></AAA>
+ // And say the p.streamElementXPath = "/AAA/BBB[. != 'b1']". Now during
+ // xml.StartElement time, the node is still empty, so it will pass
+ // the p.streamElementXPath check. However, eventually we know this
+ // shouldn't be returned to the caller. Having a second more fine-grained
+ // filter check ensures that. So in this case, the caller should really
+ // setup the stream parser with:
+ // streamElementXPath = "/AAA/BBB["
+ // streamElementFilter = "/AAA/BBB[. != 'b1']"
+ if p.streamElementFilter == nil || QuerySelector(p.doc, p.streamElementFilter) != nil {
+ return p.streamNode, nil
+ }
+ // otherwise, this isn't our target node. clean things up.
+ p.streamNode = nil
+ p.streamNodePrev = nil
+ }
+ }
case xml.CharData:
node := &Node{Type: CharDataNode, Data: string(tok), level: p.level}
if p.level == p.prev.level {
@@ -150,16 +216,87 @@ func (p *parser) parse() (*Node, error) {
}
}
-// Parse returns the parse tree for the XML from the given Reader.
-func Parse(r io.Reader) (*Node, error) {
- p := createParser(r)
- for {
- _, err := p.parse()
- if err == io.EOF {
- return p.doc, nil
- }
+// StreamParser enables loading and parsing an XML document in a streaming fashion.
+type StreamParser struct {
+ p *parser
+}
+
+// CreateStreamParser creates a StreamParser. Argument streamElementXPath is required.
+// Argument streamElementFilter is optional and should only be used in advanced scenarios.
+//
+// Scenario 1: simple case:
+// xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
+// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB")
+// if err != nil {
+// panic(err)
+// }
+// for {
+// n, err := sp.Read()
+// if err != nil {
+// break
+// }
+// fmt.Println(n.OutputXML(true))
+// }
+// Output will be:
+// <BBB>b1</BBB>
+// <BBB>b2</BBB>
+//
+// Scenario 2: advanced case:
+// xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
+// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. != 'b1']")
+// if err != nil {
+// panic(err)
+// }
+// for {
+// n, err := sp.Read()
+// if err != nil {
+// break
+// }
+// fmt.Println(n.OutputXML(true))
+// }
+// Output will be:
+// <BBB>b2</BBB>
+//
+// As the argument names indicate, streamElementXPath should be used for providing xpath query pointing
+// to the target element node only, no extra filtering on the element itself or its children; while
+// streamElementFilter, if needed, can provide additional filtering on the target element and its children.
+//
+// CreateStreamParser returns error if either streamElementXPath or streamElementFilter, if provided, cannot
+// be successfully parsed and compiled into a valid xpath query.
+func CreateStreamParser(r io.Reader, streamElementXPath string, streamElementFilter ...string) (*StreamParser, error) {
+ elemXPath, err := getQuery(streamElementXPath)
+ if err != nil {
+ return nil, fmt.Errorf("invalid streamElementXPath '%s', err: %s", streamElementXPath, err.Error())
+ }
+ elemFilter := (*xpath.Expr)(nil)
+ if len(streamElementFilter) > 0 {
+ elemFilter, err = getQuery(streamElementFilter[0])
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("invalid streamElementFilter '%s', err: %s", streamElementFilter[0], err.Error())
}
}
+ sp := &StreamParser{
+ p: createParser(r),
+ }
+ sp.p.streamElementXPath = elemXPath
+ sp.p.streamElementFilter = elemFilter
+ return sp, nil
+}
+
+// Read returns a target node that satisfies the XPath specified by caller at StreamParser creation
+// time. If there is no more satisfying target node after reading the rest of the XML document, io.EOF
+// will be returned. At any time, any XML parsing error encountered, the error will be returned and
+// the stream parsing is stopped. Calling Read() after an error is returned (including io.EOF) is not
+// allowed; the behavior will be undefined. Also note, due to the streaming nature, calling Read() will
+// automatically remove any previous target node(s) from the document tree.
+func (sp *StreamParser) Read() (*Node, error) {
+ // Because this is a streaming read, we need to release/remove last
+ // target node from the node tree to free up memory.
+ if sp.p.streamNode != nil {
+ removeFromTree(sp.p.streamNode)
+ sp.p.prev = sp.p.streamNodePrev
+ sp.p.streamNode = nil
+ sp.p.streamNodePrev = nil
+ }
+ return sp.p.parse()
}
diff --git a/parse_test.go b/parse_test.go
index 8c1f547..894ab51 100644
--- a/parse_test.go
+++ b/parse_test.go
@@ -1,6 +1,7 @@
package xmlquery
import (
+ "io"
"net/http"
"net/http/httptest"
"strings"
@@ -250,3 +251,146 @@ func TestCharData(t *testing.T) {
testValue(t, cdata.InnerText(), "Richard Lawler")
}
+
+func TestStreamParser_InvalidXPath(t *testing.T) {
+ sp, err := CreateStreamParser(strings.NewReader(""), "[invalid")
+ if err == nil || err.Error() != "invalid streamElementXPath '[invalid', err: expression must evaluate to a node-set" {
+ t.Fatalf("got non-expected error: %v", err)
+ }
+ if sp != nil {
+ t.Fatal("expected nil for sp, but got none-nil value")
+ }
+
+ sp, err = CreateStreamParser(strings.NewReader(""), ".", "[invalid")
+ if err == nil || err.Error() != "invalid streamElementFilter '[invalid', err: expression must evaluate to a node-set" {
+ t.Fatalf("got non-expected error: %v", err)
+ }
+ if sp != nil {
+ t.Fatal("expected nil for sp, but got none-nil value")
+ }
+}
+
+func root(n *Node) *Node {
+ if n == nil {
+ return nil
+ }
+ for ; n.Parent != nil; n = n.Parent {
+ }
+ return n
+}
+
+func testOutputXML(t *testing.T, msg string, expectedXML string, n *Node) {
+ if n.OutputXML(true) != expectedXML {
+ t.Fatalf("%s, expected XML: '%s', actual: '%s'", msg, expectedXML, n.OutputXML(true))
+ }
+}
+
+func TestStreamParser_Success1(t *testing.T) {
+ s := `
+
+ c1
+ b1
+ d1
+ b2z1
+ b3
+ b4
+ b5
+ c3
+ `
+
+ sp, err := CreateStreamParser(strings.NewReader(s), "/AAA/BBB", "/AAA/BBB[. != 'b3']")
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+
+ // First `` read
+ n, err := sp.Read()
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+ testOutputXML(t, "first call result", `b1`, n)
+ testOutputXML(t, "doc after first call", `<>c1b1>`, root(n))
+
+ // Second `` read
+ n, err = sp.Read()
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+ testOutputXML(t, "second call result", `b2z1`, n)
+ testOutputXML(t, "doc after second call",
+ `<>c1d1b2z1>`, root(n))
+
+ // Third `` read (Note we will skip 'b3' since the streamElementFilter excludes it)
+ n, err = sp.Read()
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+ testOutputXML(t, "third call result", `b4`, n)
+ // Note the inclusion of `<BBB>b3</BBB>` in the document. This is because `<BBB>b3</BBB>` has
+ // been filtered out and is not our target node, thus it is considered just like any other
+ // non target nodes such as `<CCC>` or `<DDD>`
+ testOutputXML(t, "doc after third call",
+ `<>c1d1b3b4>`, root(n))
+
+ // Fourth `` read
+ n, err = sp.Read()
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+ testOutputXML(t, "fourth call result", `b5`, n)
+ // Note the inclusion of `<BBB>b3</BBB>` in the document.
+ testOutputXML(t, "doc after fourth call",
+ `<>c1d1b3b5>`, root(n))
+
+ _, err = sp.Read()
+ if err != io.EOF {
+ t.Fatalf("io.EOF expected, but got %v", err)
+ }
+}
+
+func TestStreamParser_Success2(t *testing.T) {
+ s := `
+
+ c1
+ b1
+ d1
+ b2
+ c2
+ `
+
+ sp, err := CreateStreamParser(strings.NewReader(s), "/AAA/CCC | /AAA/DDD")
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+
+ // First Read() should return c1
+ n, err := sp.Read()
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+ testOutputXML(t, "first call result", `c1`, n)
+ testOutputXML(t, "doc after first call", `<>c1>`, root(n))
+
+ // Second Read() should return d1
+ n, err = sp.Read()
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+ testOutputXML(t, "second call result", `d1`, n)
+ testOutputXML(t, "doc after second call",
+ `<>b1d1>`, root(n))
+
+ // Third call should return c2
+ n, err = sp.Read()
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+ testOutputXML(t, "third call result", `c2`, n)
+ testOutputXML(t, "doc after third call",
+ `<>b1b2c2>`, root(n))
+
+ _, err = sp.Read()
+ if err != io.EOF {
+ t.Fatalf("io.EOF expected, but got %v", err)
+ }
+}