diff --git a/README.md b/README.md index 60ee2e6..f92514d 100644 --- a/README.md +++ b/README.md @@ -116,3 +116,16 @@ You can play with the `xq` utility using the Dockerized environment: docker-compose run --rm xq xq /opt/examples/xml/unformatted.xml ``` + +Output the result as JSON: + +``` +cat test/data/xml/unformatted.xml | xq -j +``` + +This will output the result in JSON format, preserving the XML structure. The JSON output will be an object where: +- XML elements become object keys +- Attributes are prefixed with "@" +- Text content is stored under "#text" if the element has attributes or child elements +- Repeated elements are automatically converted to arrays +- Elements with only text content are represented as strings diff --git a/cmd/root.go b/cmd/root.go index 65bc3aa..3ac373e 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -2,15 +2,18 @@ package cmd import ( "bytes" + "encoding/json" "errors" "fmt" - "github.com/sibprogrammer/xq/internal/utils" - "github.com/spf13/cobra" - "github.com/spf13/pflag" "io" "os" "path" "strings" + + "github.com/antchfx/xmlquery" + "github.com/sibprogrammer/xq/internal/utils" + "github.com/spf13/cobra" + "github.com/spf13/pflag" ) // Version information @@ -41,6 +44,7 @@ func NewRootCmd() *cobra.Command { reader = os.Stdin } else { + var err error if reader, err = os.Open(args[len(args)-1]); err != nil { return err } @@ -61,14 +65,16 @@ func NewRootCmd() *cobra.Command { if cssAttr != "" && cssQuery == "" { return errors.New("query option (-q) is missed for attribute selection") } + jsonOutputMode, _ := cmd.Flags().GetBool("json") pr, pw := io.Pipe() + errChan := make(chan error, 1) go func() { - defer func() { - _ = pw.Close() - }() + defer close(errChan) + defer pw.Close() + var err error if xPathQuery != "" { err = utils.XPathQuery(reader, pw, xPathQuery, singleNode, options) } else if cssQuery != "" { @@ -76,26 +82,30 @@ func NewRootCmd() *cobra.Command { } else { var contentType utils.ContentType contentType, reader = detectFormat(cmd.Flags(), reader) - - switch contentType { - case utils.ContentHtml: - err = utils.FormatHtml(reader, pw, indent, colors) - case utils.ContentXml: - err = utils.FormatXml(reader, pw, indent, colors) - case utils.ContentJson: - err = utils.FormatJson(reader, pw, indent, colors) - default: - err = fmt.Errorf("unknown content type: %v", contentType) + if jsonOutputMode { + err = processAsJSON(cmd.Flags(), reader, pw, contentType) + } else { + switch contentType { + case utils.ContentHtml: + err = utils.FormatHtml(reader, pw, indent, colors) + case utils.ContentXml: + err = utils.FormatXml(reader, pw, indent, colors) + case utils.ContentJson: + err = utils.FormatJson(reader, pw, indent, colors) + default: + err = fmt.Errorf("unknown content type: %v", contentType) + } } } - if err != nil { - fmt.Println("Error:", err) - os.Exit(1) - } + errChan <- err }() - return utils.PagerPrint(pr, cmd.OutOrStdout()) + if err := utils.PagerPrint(pr, cmd.OutOrStdout()); err != nil { + return err + } + + return <-errChan }, } } @@ -127,6 +137,9 @@ func InitFlags(cmd *cobra.Command) { "Extract an attribute value instead of node content for provided CSS query") cmd.PersistentFlags().BoolP("node", "n", utils.GetConfig().Node, "Return the node content instead of text") + cmd.PersistentFlags().BoolP("json", "j", false, "Output the result as JSON") + cmd.PersistentFlags().Bool("compact", false, "Compact JSON output (no indentation)") + cmd.PersistentFlags().IntP("depth", "d", -1, "Maximum nesting depth for JSON output (-1 for unlimited)") } func Execute() { @@ -193,7 +206,7 @@ func detectFormat(flags *pflag.FlagSet, origReader io.Reader) (utils.ContentType return utils.ContentHtml, origReader } - buf := make([]byte, 10) + buf := make([]byte, 20) length, err := origReader.Read(buf) if err != nil { return utils.ContentText, origReader @@ -211,3 +224,54 @@ func detectFormat(flags *pflag.FlagSet, origReader io.Reader) (utils.ContentType return utils.ContentXml, reader } + +func processAsJSON(flags *pflag.FlagSet, reader io.Reader, w io.Writer, contentType utils.ContentType) error { + var ( + jsonCompact bool + jsonDepth int + result interface{} + ) + jsonCompact, _ = flags.GetBool("compact") + if flags.Changed("depth") { + jsonDepth, _ = flags.GetInt("depth") + } else { + jsonDepth = -1 + } + + switch contentType { + case utils.ContentXml, utils.ContentHtml: + doc, err := xmlquery.Parse(reader) + if err != nil { + return fmt.Errorf("error while parsing XML: %w", err) + } + result = utils.NodeToJSON(doc, jsonDepth) + case utils.ContentJson: + decoder := json.NewDecoder(reader) + if err := decoder.Decode(&result); err != nil { + return fmt.Errorf("error while parsing JSON: %w", err) + } + default: + // Treat as plain text + content, err := io.ReadAll(reader) + if err != nil { + return fmt.Errorf("error while reading content: %w", err) + } + result = map[string]interface{}{ + "text": string(content), + } + } + + var encoder *json.Encoder + if jsonCompact { + encoder = json.NewEncoder(w) + } else { + encoder = json.NewEncoder(w) + encoder.SetIndent("", " ") + } + + if err := encoder.Encode(result); err != nil { + return fmt.Errorf("error while encoding JSON: %v", err) + } + + return nil +} diff --git a/cmd/root_test.go b/cmd/root_test.go index a22fd48..ac0f31e 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -2,11 +2,16 @@ package cmd import ( "bytes" - "github.com/spf13/cobra" - "github.com/stretchr/testify/assert" + "encoding/json" + "fmt" "path" "strings" "testing" + + "github.com/sibprogrammer/xq/internal/utils" + "github.com/spf13/cobra" + "github.com/spf13/pflag" + "github.com/stretchr/testify/assert" ) func execute(cmd *cobra.Command, args ...string) (string, error) { @@ -87,3 +92,101 @@ func TestRootCmd(t *testing.T) { _, err = execute(command, "--indent", "incorrect", xmlFilePath) assert.ErrorContains(t, err, "invalid argument") } + +func TestProcessAsJSON(t *testing.T) { + tests := []struct { + name string + input string + contentType utils.ContentType + flags map[string]interface{} + expected map[string]interface{} + wantErr bool + }{ + { + name: "Simple XML", + input: "value", + contentType: utils.ContentXml, + expected: map[string]interface{}{ + "root": map[string]interface{}{ + "child": "value", + }, + }, + }, + {name: "Simple JSON", + input: `{"root": {"child": "value"}}`, + contentType: utils.ContentJson, + expected: map[string]interface{}{ + "root": map[string]interface{}{ + "child": "value", + }, + }, + }, + { + name: "Simple HTML", + input: "

text

", + contentType: utils.ContentHtml, + expected: map[string]interface{}{ + "html": map[string]interface{}{ + "body": map[string]interface{}{ + "p": "text", + }, + }, + }, + }, + { + name: "Plain text", + input: "text", + contentType: utils.ContentText, + expected: map[string]interface{}{ + "text": "text", + }, + }, + { + name: "invalid input", + input: "thinking>\nI'll analyze each command and its output:\n", + wantErr: true, + }, + { + name: "combined", + expected: map[string]interface{}{ + "#text": "Thank you\nBye.", + "thinking": "1. woop", + }, + input: `Thank you + +1. woop + + +Bye.`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Set up flags + flags := pflag.NewFlagSet("test", pflag.ContinueOnError) + flags.Bool("compact", false, "") + flags.Int("depth", -1, "") + for name, v := range tt.flags { + _ = flags.Set(name, fmt.Sprint(v)) + } + + reader := strings.NewReader(tt.input) + var output bytes.Buffer + + err := processAsJSON(flags, reader, &output, tt.contentType) + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + + var resultMap map[string]interface{} + err = json.Unmarshal(output.Bytes(), &resultMap) + assert.NoError(t, err) + + assert.Equal(t, tt.expected, resultMap) + } + }) + } +} diff --git a/go.mod b/go.mod index d825608..497a5a6 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/antchfx/xmlquery v1.4.2 github.com/antchfx/xpath v1.3.2 github.com/fatih/color v1.18.0 + github.com/google/go-cmp v0.6.0 github.com/spf13/cobra v1.8.1 github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.9.0 diff --git a/go.sum b/go.sum index 7d5cd88..9d041b0 100644 --- a/go.sum +++ b/go.sum @@ -14,6 +14,8 @@ github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= diff --git a/internal/utils/contenttype_string.go b/internal/utils/contenttype_string.go new file mode 100644 index 0000000..a1d3f07 --- /dev/null +++ b/internal/utils/contenttype_string.go @@ -0,0 +1,26 @@ +// Code generated by "stringer -type ContentType"; DO NOT EDIT. + +package utils + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[ContentXml-0] + _ = x[ContentHtml-1] + _ = x[ContentJson-2] + _ = x[ContentText-3] +} + +const _ContentType_name = "ContentXmlContentHtmlContentJsonContentText" + +var _ContentType_index = [...]uint8{0, 10, 21, 32, 43} + +func (i ContentType) String() string { + if i < 0 || i >= ContentType(len(_ContentType_index)-1) { + return "ContentType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _ContentType_name[_ContentType_index[i]:_ContentType_index[i+1]] +} diff --git a/internal/utils/gen.go b/internal/utils/gen.go new file mode 100644 index 0000000..28040f6 --- /dev/null +++ b/internal/utils/gen.go @@ -0,0 +1,2 @@ +//go:generate go run golang.org/x/tools/cmd/stringer@latest -type=ContentType +package utils diff --git a/internal/utils/jsonutil.go b/internal/utils/jsonutil.go new file mode 100644 index 0000000..cbbd868 --- /dev/null +++ b/internal/utils/jsonutil.go @@ -0,0 +1,125 @@ +package utils + +import ( + "strings" + + "github.com/antchfx/xmlquery" +) + +// NodeToJSON converts an xmlquery.Node to a JSON object. The depth parameter +// specifies how many levels of children to include in the result. A depth of 0 means +// only the text content of the node is included. A depth of 1 means the node's children +// are included, but not their children, and so on. +func NodeToJSON(node *xmlquery.Node, depth int) interface{} { + if node == nil { + return nil + } + + switch node.Type { + case xmlquery.DocumentNode: + result := make(map[string]interface{}) + var textParts []string + + // Process the next sibling of the document node first (if any) + if node.NextSibling != nil && node.NextSibling.Type == xmlquery.TextNode { + text := strings.TrimSpace(node.NextSibling.Data) + if text != "" { + textParts = append(textParts, text) + } + } + + // Process all children, including siblings of the first child + for child := node.FirstChild; child != nil; child = child.NextSibling { + switch child.Type { + case xmlquery.ElementNode: + childResult := nodeToJSONInternal(child, depth) + result[child.Data] = childResult + case xmlquery.TextNode: + text := strings.TrimSpace(child.Data) + if text != "" { + textParts = append(textParts, text) + } + } + } + + if len(textParts) > 0 { + result["#text"] = strings.Join(textParts, "\n") + } + return result + + case xmlquery.ElementNode: + return nodeToJSONInternal(node, depth) + + case xmlquery.TextNode: + return strings.TrimSpace(node.Data) + + default: + return nil + } +} + +func nodeToJSONInternal(node *xmlquery.Node, depth int) interface{} { + if depth == 0 { + return getTextContent(node) + } + + result := make(map[string]interface{}) + for _, attr := range node.Attr { + result["@"+attr.Name.Local] = attr.Value + } + + var textParts []string + for child := node.FirstChild; child != nil; child = child.NextSibling { + switch child.Type { + case xmlquery.TextNode: + text := strings.TrimSpace(child.Data) + if text != "" { + textParts = append(textParts, text) + } + case xmlquery.ElementNode: + childResult := nodeToJSONInternal(child, depth-1) + addToResult(result, child.Data, childResult) + } + } + + if len(textParts) > 0 { + if len(result) == 0 { + return strings.Join(textParts, "\n") + } + result["#text"] = strings.Join(textParts, "\n") + } + + return result +} + +func getTextContent(node *xmlquery.Node) string { + var parts []string + for child := node.FirstChild; child != nil; child = child.NextSibling { + switch child.Type { + case xmlquery.TextNode: + text := strings.TrimSpace(child.Data) + if text != "" { + parts = append(parts, text) + } + case xmlquery.ElementNode: + parts = append(parts, getTextContent(child)) + } + } + return strings.Join(parts, "\n") +} + +func addToResult(result map[string]interface{}, key string, value interface{}) { + if key == "" { + return + } + if existing, ok := result[key]; ok { + switch existing := existing.(type) { + case []interface{}: + result[key] = append(existing, value) + default: + result[key] = []interface{}{existing, value} + } + } else { + result[key] = value + } +} diff --git a/internal/utils/jsonutil_test.go b/internal/utils/jsonutil_test.go new file mode 100644 index 0000000..d627ade --- /dev/null +++ b/internal/utils/jsonutil_test.go @@ -0,0 +1,96 @@ +package utils + +import ( + "encoding/json" + "strings" + "testing" + + "github.com/antchfx/xmlquery" + "github.com/google/go-cmp/cmp" +) + +func TestNodeToJSON(t *testing.T) { + tests := []struct { + name string + input string + depth int + expected string + }{ + { + name: "Simple XML", + input: "value", + depth: -1, + expected: `{"root":{"child":"value"}}`, + }, + { + name: "XML with attributes", + input: "text", + depth: -1, + expected: `{"root":{"@attr":"value","child":"text"}}`, + }, + { + name: "XML with mixed content", + input: "\n text value\n more text\n", + depth: -1, + expected: `{"root":{"#text":"text\nmore text","child":"value"}}`, + }, + { + name: "Depth limited XML", + input: "valuetext", + depth: 2, + expected: `{"root":{"child1":{"grandchild":"value"},"child2":"text"}}`, + }, + { + name: "Depth 1 XML", + input: "valuetext", + depth: 1, + expected: `{"root":{"child1":"value","child2":"text"}}`, + }, + { + name: "Depth 0 XML", + input: "valuetext", + depth: 0, + expected: `{"root":"value\ntext"}`, + }, + { + name: "mixed text and xml", + input: `Thank you + +1. woop + + +Bye`, + expected: `{"#text":"Thank you\nBye","thinking":"1. woop"}`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + doc, err := xmlquery.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Fatalf("Failed to parse XML: %v", err) + } + + result := NodeToJSON(doc, tt.depth) + resultJSON, err := json.Marshal(result) + if err != nil { + t.Fatalf("Failed to marshal result to JSON: %v", err) + } + + var resultMap, expectedMap map[string]interface{} + err = json.Unmarshal(resultJSON, &resultMap) + if err != nil { + t.Fatalf("Failed to unmarshal result JSON: %v", err) + } + err = json.Unmarshal([]byte(tt.expected), &expectedMap) + if err != nil { + t.Fatalf("Failed to unmarshal expected JSON: %v", err) + } + + t.Log(string(resultJSON)) + if diff := cmp.Diff(expectedMap, resultMap); diff != "" { + t.Errorf("NodeToJSON mismatch (-want +got):\n%s", diff) + } + }) + } +}