Support bulk load of JSON data. #2799

Merged: 37 commits, Dec 17, 2018

Changes from 27 commits

Commits (37)
463fa65
Based on bulk schema issue discussion.
manishrjain Nov 15, 2018
0533fea
Merge remote-tracking branch 'origin/master' into codexnull/bulk-schema
Nov 16, 2018
d0f2217
Don't skip predicates with value type of default when loading the sch…
Nov 16, 2018
4b7094d
Allow running test.sh from another directory.
Nov 16, 2018
568face
Remove debugging comments.
Nov 16, 2018
f364e15
Remove debug comment.
Nov 16, 2018
b4bd7c4
Keep all predicates from bulk import schema, not just the ones used.
Nov 18, 2018
1b36ce5
Merge branch 'master' into codexnull/bulk-schema
Nov 19, 2018
358921d
Merge branch 'master' into codexnull/bulk-schema
Nov 19, 2018
424b71b
Merge in changes from master.
Nov 19, 2018
043460c
Make set of predicates the union of predicates in the schema and rdf.
Nov 20, 2018
7173a1b
Merge branch 'master' into codexnull/bulk-schema
Nov 26, 2018
995ca9a
Allow or disallow schema and rdf files depending on whether map phase…
Nov 27, 2018
d4225bc
Add test for schema after export/bulk load.
Nov 27, 2018
0382370
Add more schema test cases.
Nov 27, 2018
2915889
Revert disallowing schema and rdf files with --skip_map_phase; loader…
Nov 27, 2018
002a94b
In-progress work for bulk loading JSON data.
Nov 29, 2018
d7334cb
Merge branch 'master' into codexnull/bulk-json
Nov 29, 2018
41a259e
Work in progress...
Nov 29, 2018
6251c80
Only output open file limit info if there is a problem.
Nov 29, 2018
9314707
in progress...
Nov 30, 2018
e8f3f3a
First complete(?) feature implementation.
Nov 30, 2018
0957b92
Merge branch 'master' into codexnull/bulk-json
Nov 30, 2018
e2dd97d
Remove debug options from BUILD_FLAGS.
Nov 30, 2018
17236f1
Merge branch 'master' into codexnull/bulk-json
codexnull Nov 30, 2018
2d715bc
Fix some warnings about ignoring error return value.
Nov 30, 2018
22f1219
Add, improve comments.
Nov 30, 2018
56728ce
Refactor RDF-to-NQuad and JSON-to-NQuad code to allow testing each in…
Dec 6, 2018
8213ed1
Add some bulk loading tests... will add more.
Dec 8, 2018
1fc85b9
Add more test cases.
Dec 10, 2018
32f646e
Refactor bulk loader chunking to use interfaces. Add more tests.
Dec 11, 2018
bbdd686
Refactor bulk loader mapping to use interfaces.
Dec 11, 2018
b975cf8
Run goimports on bulk/{mapper,run}.go.
Dec 11, 2018
5dbc835
Simplify chunker code.
manishrjain Dec 14, 2018
09984ad
Consolidate all chunker code into one interface, one file and corresp…
manishrjain Dec 14, 2018
a96df13
Manish final review
manishrjain Dec 14, 2018
7560ab5
Merge branch 'master' into codexnull/bulk-json
Dec 17, 2018
3 changes: 3 additions & 0 deletions .gitignore
@@ -12,6 +12,9 @@ gql/fuzz-data/suppressions
.idea
dgraph.iml

# vim session backups
.*.swp

# Binaries for programs and plugins
*.exe
*.exe~
190 changes: 161 additions & 29 deletions dgraph/cmd/bulk/loader.go
@@ -22,6 +22,7 @@ import (
"compress/gzip"
"context"
"fmt"
"github.com/pkg/errors"

File is not goimports-ed (from goimports)

"io"
"io/ioutil"
"os"
@@ -30,6 +31,7 @@ import (
"strings"
"sync"
"time"
"unicode"

"github.com/dgraph-io/badger"
bo "github.com/dgraph-io/badger/options"
@@ -42,6 +44,7 @@

type options struct {
RDFDir string
JSONDir string
SchemaFile string
DgraphsDir string
TmpDir string
@@ -63,16 +66,21 @@
shardOutputDirs []string
}

const (
rdfLoader int = iota
jsonLoader
)

type state struct {
opt options
prog *progress
xids *xidmap.XidMap
schema *schemaStore
shards *shardMap
rdfChunkCh chan *bytes.Buffer
mapFileId uint32 // Used atomically to name the output files of the mappers.
dbs []*badger.DB
writeTs uint64 // All badger writes use this timestamp
opt options
prog *progress
xids *xidmap.XidMap
schema *schemaStore
shards *shardMap
readerChunkCh chan *bytes.Buffer
mapFileId uint32 // Used atomically to name the output files of the mappers.
dbs []*badger.DB
writeTs uint64 // All badger writes use this timestamp
}

type loader struct {
@@ -94,8 +102,8 @@ func newLoader(opt options) *loader {
prog: newProgress(),
shards: newShardMap(opt.MapShards),
// Lots of gz readers, so not much channel buffer needed.
rdfChunkCh: make(chan *bytes.Buffer, opt.NumGoroutines),
writeTs: getWriteTimestamp(zero),
readerChunkCh: make(chan *bytes.Buffer, opt.NumGoroutines),
writeTs: getWriteTimestamp(zero),
}
st.schema = newSchemaStore(readSchema(opt.SchemaFile), opt, st)
ld := &loader{
@@ -142,9 +150,9 @@ func readSchema(filename string) []*pb.SchemaUpdate {
return initialSchema
}

func readChunk(r *bufio.Reader) (*bytes.Buffer, error) {
func readRDFChunk(r *bufio.Reader) (*bytes.Buffer, error) {
batch := new(bytes.Buffer)
batch.Grow(10 << 20)
batch.Grow(1 << 20)
for lineCount := 0; lineCount < 1e5; lineCount++ {
slc, err := r.ReadSlice('\n')
if err == io.EOF {
@@ -174,13 +182,114 @@
return batch, nil
}

func findRDFFiles(dir string) []string {
func skipSpace(r *bufio.Reader) error {
ch, _, err := r.ReadRune()
for unicode.IsSpace(ch) {
ch, _, err = r.ReadRune()
}

if err == nil {
err = r.UnreadRune()
}

return nil
}

func readJSONPreChunk(r *bufio.Reader) error {
// The JSON file to load must be an array of maps (that is, '[ { ... }, { ... }, ... ]').
// This function must be called before calling readJSONChunk for the first time to advance
// the Reader past the array start token ('[') so that calls to readJSONChunk can read
// one array element at a time instead of having to read the entire array into memory.
if err := skipSpace(r); err != nil {
return err
}

ch, _, err := r.ReadRune()
if err != nil && err != io.EOF {
return err
} else if ch != '[' {
return errors.New("json file must contain array")
}

return nil
}

func readJSONChunk(r *bufio.Reader) (*bytes.Buffer, error) {
batch := new(bytes.Buffer)
batch.Grow(1 << 20)

// For RDF, the loader just reads the input and the mapper parses it into nquads,
// so do the same for JSON. But since JSON is not line-oriented like RDF, it's a little
// more complicated to ensure a complete JSON structure is read.

if err := skipSpace(r); err != nil {
return nil, err
}

ch, _, err := r.ReadRune()
if err == io.EOF {
return batch, err
} else if ch != '{' {
return nil, errors.New("expected json map start")
}

// Just find the matching closing brace. Let the JSON-to-nquad parser in the mapper worry
// about whether everything in between is valid JSON or not.
depth := 0
quoted := false
done := false
var pch rune
for !done {
batch.WriteRune(ch)
pch = ch

ch, _, err = r.ReadRune()
if err != nil {
// any error at this point, even EOF, is fatal
return nil, errors.New("malformed json")
}

switch ch {
case '{':
if !quoted {
depth++
}
case '}':
if !quoted {
if depth == 0 {
batch.WriteRune(ch)
done = true
} else {
depth--
}
}
case '"':
if !quoted || (quoted && pch != '\\') {
quoted = !quoted
}
}
}

// The map should be followed by either the ',' between array elements, or the ']'
// at the end of the array.
_ = skipSpace(r)
ch, _, err = r.ReadRune()
if ch == ']' {
err = io.EOF
} else if ch != ',' {
// Let the next call to this function report the error.
_ = r.UnreadRune()
}

return batch, err
}
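
To make the chunking contract above concrete, here is a minimal driver sketch. It is not part of this PR: the sample input, the exampleChunking name, and the loop are illustrative only, and it assumes readJSONPreChunk and readJSONChunk are in scope. readJSONPreChunk swallows the opening '[', after which each readJSONChunk call returns exactly one top-level map until the closing ']' is reached. The loop mirrors the one in mapStage, with the send on readerChunkCh replaced by a print.

```go
// Sketch only, not part of this PR.
package bulk

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

func exampleChunking() {
	input := `[ {"name": "alice"}, {"name": "bob", "friend": {"name": "carol"}} ]`
	r := bufio.NewReader(strings.NewReader(input))

	// Advance past the leading '[' so each readJSONChunk call yields one map.
	if err := readJSONPreChunk(r); err != nil {
		panic(err)
	}
	for {
		chunk, err := readJSONChunk(r)
		if err != nil && err != io.EOF {
			panic(err)
		}
		if chunk != nil && chunk.Len() > 0 {
			fmt.Println(chunk.String()) // one complete JSON map per chunk
		}
		if err == io.EOF {
			break
		}
	}
}
```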

func findDataFiles(dir string, ext string) []string {
var files []string
x.Check(filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if strings.HasSuffix(path, ".rdf") || strings.HasSuffix(path, ".rdf.gz") {
if strings.HasSuffix(path, ext) || strings.HasSuffix(path, ext+".gz") {
files = append(files, path)
}
return nil
@@ -211,59 +320,82 @@ func (ld *loader) mapStage() {
LRUSize: 1 << 19,
})

var files []string
var ext string
var loaderType int
var chunker func(r *bufio.Reader) (*bytes.Buffer, error)
if ld.opt.RDFDir != "" {
loaderType = rdfLoader
ext = ".rdf"
files = findDataFiles(ld.opt.RDFDir, ext)
chunker = readRDFChunk
} else {
loaderType = jsonLoader
ext = ".json"
files = findDataFiles(ld.opt.JSONDir, ext)
chunker = readJSONChunk
}

readers := make(map[string]*bufio.Reader)
for _, rdfFile := range findRDFFiles(ld.opt.RDFDir) {
f, err := os.Open(rdfFile)
for _, file := range files {
f, err := os.Open(file)
x.Check(err)
defer f.Close()
if !strings.HasSuffix(rdfFile, ".gz") {
readers[rdfFile] = bufio.NewReaderSize(f, 1<<20)
// TODO detect compressed input instead of relying on filename
// so data can be streamed in
if !strings.HasSuffix(file, ".gz") {
readers[file] = bufio.NewReaderSize(f, 1<<20)
} else {
gzr, err := gzip.NewReader(f)
x.Checkf(err, "Could not create gzip reader for RDF file %q.", rdfFile)
readers[rdfFile] = bufio.NewReader(gzr)
x.Checkf(err, "Could not create gzip reader for file %q.", file)
readers[file] = bufio.NewReader(gzr)
}
}

if len(readers) == 0 {
fmt.Println("No rdf files found.")
fmt.Printf("No *%s files found.\n", ext)
os.Exit(1)
}

var mapperWg sync.WaitGroup
mapperWg.Add(len(ld.mappers))
for _, m := range ld.mappers {
go func(m *mapper) {
m.run()
m.run(loaderType)
mapperWg.Done()
}(m)
}

// This is the main map loop.
thr := x.NewThrottle(ld.opt.NumGoroutines)
var fileCount int
for rdfFile, r := range readers {
for file, r := range readers {
thr.Start()
fileCount++
fmt.Printf("Processing file (%d out of %d): %s\n", fileCount, len(readers), rdfFile)
fmt.Printf("Processing file (%d out of %d): %s\n", fileCount, len(readers), file)
go func(r *bufio.Reader) {
defer thr.Done()
if loaderType == jsonLoader {
if err := readJSONPreChunk(r); err != nil {
x.Check(err)
}
}
for {
chunkBuf, err := readChunk(r)
chunkBuf, err := chunker(r)
if err == io.EOF {
if chunkBuf.Len() != 0 {
ld.rdfChunkCh <- chunkBuf
ld.readerChunkCh <- chunkBuf
}
break
}
x.Check(err)
ld.rdfChunkCh <- chunkBuf
ld.readerChunkCh <- chunkBuf
}
}(r)
}
thr.Wait()

close(ld.rdfChunkCh)
close(ld.readerChunkCh)
mapperWg.Wait()

// Allow memory to GC before the reduce phase.
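
The TODO in mapStage above notes that compressed input is currently recognized only by the ".gz" filename suffix. One possible direction, sketched below and not part of this PR (the newDataReader name is made up), is to sniff the gzip magic bytes instead, which would also let data be streamed in:

```go
// Sketch only, not part of this PR.
package bulk

import (
	"bufio"
	"compress/gzip"
	"io"
)

// newDataReader wraps any input stream and transparently decodes gzip if the
// stream starts with the gzip magic bytes 0x1f 0x8b.
func newDataReader(rd io.Reader) (*bufio.Reader, error) {
	br := bufio.NewReaderSize(rd, 1<<20)
	magic, err := br.Peek(2) // Peek does not consume, so br stays usable either way.
	if err != nil && err != io.EOF {
		return nil, err
	}
	if len(magic) == 2 && magic[0] == 0x1f && magic[1] == 0x8b {
		gzr, err := gzip.NewReader(br)
		if err != nil {
			return nil, err
		}
		return bufio.NewReader(gzr), nil
	}
	return br, nil
}
```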
57 changes: 42 additions & 15 deletions dgraph/cmd/bulk/mapper.go
@@ -20,6 +20,7 @@ import (
"bytes"
"encoding/binary"
"fmt"
"github.com/dgraph-io/dgraph/edgraph"

File is not goimports-ed (from goimports)

"io"
"log"
"math"
@@ -118,28 +119,54 @@ func (m *mapper) writeMapEntriesToFile(entriesBuf []byte, shardIdx int) {
x.Check(x.WriteFileSync(filename, entriesBuf, 0644))
}

func (m *mapper) run() {
for chunkBuf := range m.rdfChunkCh {
func (m *mapper) run(loaderType int) {
for chunkBuf := range m.readerChunkCh {
done := false
for !done {
rdf, err := chunkBuf.ReadString('\n')
if err == io.EOF {
// Process the last RDF rather than breaking immediately.
done = true
if loaderType == rdfLoader {
str, err := chunkBuf.ReadString('\n')
if err == io.EOF {
// Process the last chunk rather than breaking immediately.
done = true
} else {
x.Check(err)
}
str = strings.TrimSpace(str)

// process RDF line
if err := m.processRDF(str); err != nil {
atomic.AddInt64(&m.prog.errCount, 1)
if !m.opt.IgnoreErrors {
x.Check(err)
}
}

atomic.AddInt64(&m.prog.rdfCount, 1)
} else {
x.Check(err)
}
rdf = strings.TrimSpace(rdf)
// process JSON chunk; readJSONChunk produces one complete JSON map per chunk,
// so draining the buffer (io.EOF from ReadBytes) means this chunk is done.
str, readErr := chunkBuf.ReadBytes(0)
if readErr == io.EOF {
done = true
} else if readErr != nil {
x.Check(readErr)
}

nquads, err := edgraph.NquadsFromJson(str)
if err != nil {
atomic.AddInt64(&m.prog.errCount, 1)
if !m.opt.IgnoreErrors {
x.Check(err)
}
}

// process RDF line
if err := m.processRDF(rdf); err != nil {
atomic.AddInt64(&m.prog.errCount, 1)
if !m.opt.IgnoreErrors {
x.Check(err)
for _, nq := range nquads {
if err := facets.SortAndValidate(nq.Facets); err != nil {
if !m.opt.IgnoreErrors {
x.Check(err)
}
}

m.processNQuad(gql.NQuad{NQuad: nq})
atomic.AddInt64(&m.prog.rdfCount, 1)
}
}

atomic.AddInt64(&m.prog.rdfCount, 1)
for i := range m.shards {
sh := &m.shards[i]
if len(sh.entriesBuf) >= int(m.opt.MapBufSize) {
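
For readers new to the JSON path in run(): each chunk delivered on readerChunkCh holds one JSON map, whose raw bytes go through edgraph.NquadsFromJson and are then fed into the same processNQuad sink as parsed RDF. Below is a rough, self-contained sketch of that step, not part of this PR; the exampleJSONMapping name, the literal input, and the printed fields are assumptions based on the api.NQuad usage in the diff.

```go
// Sketch only, not part of this PR. Assumes the signature used in the diff:
// edgraph.NquadsFromJson([]byte) ([]*api.NQuad, error).
package bulk

import (
	"fmt"
	"log"

	"github.com/dgraph-io/dgraph/edgraph"
)

func exampleJSONMapping() {
	chunk := []byte(`{"name": "alice", "age": 25, "friend": {"name": "bob"}}`)
	nquads, err := edgraph.NquadsFromJson(chunk)
	if err != nil {
		// In the mapper this would bump prog.errCount and honor the IgnoreErrors option.
		log.Fatal(err)
	}
	for _, nq := range nquads {
		// Each nq is an *api.NQuad; the mapper validates its facets and then feeds
		// gql.NQuad{NQuad: nq} into processNQuad, the same sink the RDF path uses.
		fmt.Printf("%s %s %v %v\n", nq.Subject, nq.Predicate, nq.ObjectId, nq.ObjectValue)
	}
}
```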