Live & bulk loader changes #2961

Merged
merged 62 commits into master from javier/issue2889_live_load_json
Feb 6, 2019
Changes from 50 commits
Commits
62 commits
6145526
Accept directory as argument to --rdfs and look for .rdf and .json fi…
Jan 12, 2019
6e395be
Merge branch 'master' into javier/issue2889_live_load_json
Jan 12, 2019
98e0ea1
Add x.FileReader() function to return a reader that decompresses if n…
Jan 14, 2019
3ca188a
Forward file to the RDF or JSON processor as appropriate.
Jan 14, 2019
de3da8f
Working version?
Jan 15, 2019
c4291db
Add live test stub.
Jan 17, 2019
5866bd7
Ensure pending nquads are submitted after EOF.
Jan 17, 2019
86c421f
Change references to RDFs to N-Quads to be more correct.
Jan 17, 2019
9cb6dca
Merge master.
Jan 17, 2019
bdfcd85
Export TMPDIR in test.sh so that hopefully tests will use it. Change …
Jan 17, 2019
1e9c530
Change test data schema to reflect changes merged from master.
Jan 17, 2019
c495a01
Live JSON load test.
Jan 17, 2019
f499943
Auto-detect JSON in load files with no .rdf or .json extension in the…
Jan 18, 2019
7b5ad3b
Fix bug reading streamed JSON. Add test of loading streamed JSON.
Jan 18, 2019
405c60a
Live load testing improvements.
Jan 18, 2019
37c3437
First attempt at autogenerating uid from key fields.
Jan 19, 2019
afe44d4
Replace RDFs in message with N-Quads.
Jan 19, 2019
9da9d8e
Add debugging code.
Jan 19, 2019
fce4ff6
Fix bug assigning blank ids. Move more testing code to testing package.
Jan 21, 2019
fbad4cf
Add test of live loading multiple JSON files.
Jan 21, 2019
8c073d5
Add test of live loading JSON without UID field. Testing improvements.
Jan 21, 2019
f828d57
Add tests of live JSON load with auto-uid.
Jan 22, 2019
44b1a73
Merge branch 'master' into javier/issue2889_live_load_json
Jan 22, 2019
cc59eed
Small restartCluster fix.
Jan 22, 2019
f6b25ca
Merge master.
Jan 22, 2019
06fd748
Rename fields to be more correct or specific.
Jan 22, 2019
a23a169
Improved comments.
Jan 22, 2019
4495702
Remove test.sh change that should be part of another branch.
Jan 22, 2019
2a94576
Fix bugs catching errors.
Jan 23, 2019
c9833fd
Make auto-added blank uid fields per file.
Jan 23, 2019
cee8c8d
Don't bother hashing the concatenated key values to generate a blank …
Jan 24, 2019
3b7b648
Refactor processRdfFile
Jan 25, 2019
ad641a2
Minor changes.
Jan 28, 2019
5b4c423
Add JsonToNquads to replace NquadsFromJson later.
Jan 28, 2019
4747690
Rename some functions. Don't dump stack trace for input error.
Jan 28, 2019
6f01e18
Move chunk test from bulk package to x package.
Jan 28, 2019
0cb8030
Change processRdf to use a chunker as well.
Jan 28, 2019
ce62a5a
Move chunker from x package to a new loadfile package. Remove bulk ch…
Jan 29, 2019
7286176
Merge master.
Jan 30, 2019
1a409cf
Refactor more bulk/live code.
Jan 31, 2019
0454e92
Minor testing changes.
Jan 31, 2019
77d029b
Merge master.
Jan 31, 2019
50f5744
Merge master.
Jan 31, 2019
500fa12
Add a small visual indicator of when a long-running test starts.
Jan 31, 2019
05ff572
Fix rdfChunker.Parse to parse all RDF lines in chunk instead of only …
Jan 31, 2019
487dcc5
Add check for EOF in RDF chunk parser.
Jan 31, 2019
17125d9
Add --key option to bulk loader for parity with live loader.
Jan 31, 2019
8e7b581
Fix batching, which was broken by live/bulk refactoring.
Feb 1, 2019
1c2c9ef
Parse key fields earlier in the process.
Feb 1, 2019
116a794
Cleanup suggestions from PR.
Feb 1, 2019
8f16e04
PR review fixes.
Feb 5, 2019
7685150
Fix warning.
Feb 5, 2019
dd237b1
Remove --key support since it requires more thought.
Feb 5, 2019
db7f34f
Remove two-line functions processJsonFile and processRdfFile.
Feb 6, 2019
915eef8
Inline nextNquads() and finalNquads() into processLoadFile().
Feb 6, 2019
d09e925
Rename package loadfile to chunker.
Feb 6, 2019
94f355b
Move package rdf to chunker/rdf.
Feb 6, 2019
9f39ac9
WIP...
Feb 6, 2019
6872833
Move json and rdf nquad parsing under chunker.
Feb 6, 2019
ec990e2
Move FileReader() and IsJSONData() from x package to chunker.
Feb 6, 2019
ac41b58
Rename live --rdfs option to --files since it can load more than *.rd…
Feb 6, 2019
6143636
Fix a bug where batch slice was being modified after sending to a cha…
manishrjain Feb 6, 2019
2 changes: 1 addition & 1 deletion contrib/scripts/functions.sh
@@ -16,7 +16,7 @@ function restartCluster {
echo "Rebuilding dgraph ..."
make install
docker ps -a --filter label="cluster=test" --format "{{.Names}}" | xargs -r docker rm -f
docker-compose -p dgraph -f $compose_file up --force-recreate --remove-orphans --detach
docker-compose -p dgraph -f $compose_file up --force-recreate --remove-orphans --detach || exit 1
popd >/dev/null

$basedir/contrib/wait-for-it.sh -t 60 localhost:6080 || exit 1
65 changes: 20 additions & 45 deletions dgraph/cmd/bulk/loader.go
@@ -17,7 +17,6 @@
package bulk

import (
"bufio"
"bytes"
"compress/gzip"
"context"
@@ -27,16 +26,17 @@ import (
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"time"

"github.com/dgraph-io/badger"
bo "github.com/dgraph-io/badger/options"
"github.com/dgraph-io/dgraph/loadfile"
"github.com/dgraph-io/dgraph/protos/pb"
"github.com/dgraph-io/dgraph/schema"
"github.com/dgraph-io/dgraph/x"
"github.com/dgraph-io/dgraph/xidmap"

"google.golang.org/grpc"
)

@@ -58,11 +58,13 @@ type options struct {
HttpAddr string
IgnoreErrors bool
CustomTokenizers string
KeyFields string

MapShards int
ReduceShards int

shardOutputDirs []string
parsedKeyFields []string
}

type state struct {
@@ -144,25 +146,6 @@ func readSchema(filename string) []*pb.SchemaUpdate {
return initialSchema
}

func findDataFiles(dir string, ext string) []string {
var files []string
x.Check(filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if strings.HasSuffix(path, ext) || strings.HasSuffix(path, ext+".gz") {
files = append(files, path)
}
return nil
}))
return files
}

type uidRangeResponse struct {
uids *pb.AssignedIds
err error
}

func (ld *loader) mapStage() {
ld.prog.setPhase(mapPhase)

@@ -181,29 +164,29 @@ func (ld *loader) mapStage() {
LRUSize: 1 << 19,
})

var files []string
var ext string
var dir, ext string
var loaderType int
if ld.opt.RDFDir != "" {
loaderType = rdfInput
loaderType = loadfile.RdfInput
dir = ld.opt.RDFDir
ext = ".rdf"
files = findDataFiles(ld.opt.RDFDir, ext)
} else {
loaderType = jsonInput
loaderType = loadfile.JsonInput
dir = ld.opt.JSONDir
ext = ".json"
files = findDataFiles(ld.opt.JSONDir, ext)
}

}
files := x.FindDataFiles(dir, []string{ext, ext + ".gz"})
if len(files) == 0 {
fmt.Printf("No *%s files found.\n", ext)
fmt.Printf("No *%s files found under %s.\n", ext, dir)
os.Exit(1)
}

var mapperWg sync.WaitGroup
mapperWg.Add(len(ld.mappers))
for _, m := range ld.mappers {
go func(m *mapper) {
m.run(loaderType)
m.run(loaderType, &ld.opt.parsedKeyFields)
mapperWg.Done()
}(m)
}
@@ -213,25 +196,17 @@ func (ld *loader) mapStage() {
for i, file := range files {
thr.Start()
fmt.Printf("Processing file (%d out of %d): %s\n", i+1, len(files), file)
chunker := newChunker(loaderType)
chunker := loadfile.NewChunker(loaderType)

go func(file string) {
defer thr.Done()

f, err := os.Open(file)
x.Check(err)
defer f.Close()
r, cleanup_fn := x.FileReader(file)
defer cleanup_fn()

var r *bufio.Reader
if !strings.HasSuffix(file, ".gz") {
r = bufio.NewReaderSize(f, 1<<20)
} else {
gzr, err := gzip.NewReader(f)
x.Checkf(err, "Could not create gzip reader for file %q.", file)
r = bufio.NewReaderSize(gzr, 1<<20)
}
x.Check(chunker.begin(r))
x.Check(chunker.Begin(r))
for {
chunkBuf, err := chunker.chunk(r)
chunkBuf, err := chunker.Chunk(r)
if chunkBuf != nil && chunkBuf.Len() > 0 {
ld.readerChunkCh <- chunkBuf
}
@@ -241,7 +216,7 @@ func (ld *loader) mapStage() {
x.Check(err)
}
}
x.Check(chunker.end(r))
x.Check(chunker.End(r))
}(file)
}
thr.Wait()
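
The per-file gzip/bufio setup removed above is now hidden behind x.FileReader, which the new code calls as r, cleanup_fn := x.FileReader(file). The helper's body is not part of this hunk; the following is only a minimal sketch of what it could look like, assuming it simply folds the deleted logic (buffered reads plus optional gzip decompression) into one place. The name fileReader and the sample path are illustrative, not Dgraph's actual code.

package main

import (
	"bufio"
	"compress/gzip"
	"io"
	"log"
	"os"
	"strings"
)

// fileReader mimics what x.FileReader appears to do at its call site: it
// returns a buffered reader for path, transparently decompressing *.gz
// input, plus a cleanup function that closes whatever was opened.
func fileReader(path string) (*bufio.Reader, func()) {
	f, err := os.Open(path)
	if err != nil {
		log.Fatalf("cannot open %q: %v", path, err)
	}
	var rd io.Reader = f
	cleanup := func() { f.Close() }
	if strings.HasSuffix(path, ".gz") {
		gzr, err := gzip.NewReader(f)
		if err != nil {
			log.Fatalf("cannot create gzip reader for %q: %v", path, err)
		}
		rd = gzr
		cleanup = func() { gzr.Close(); f.Close() }
	}
	return bufio.NewReaderSize(rd, 1<<20), cleanup
}

func main() {
	r, cleanup := fileReader("data.json.gz") // hypothetical input file
	defer cleanup()
	line, err := r.ReadString('\n')
	if err != nil && err != io.EOF {
		log.Fatal(err)
	}
	log.Printf("first line: %s", line)
}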
14 changes: 10 additions & 4 deletions dgraph/cmd/bulk/mapper.go
@@ -32,6 +32,7 @@ import (

"github.com/dgraph-io/dgo/protos/api"
"github.com/dgraph-io/dgraph/gql"
"github.com/dgraph-io/dgraph/loadfile"
"github.com/dgraph-io/dgraph/posting"
"github.com/dgraph-io/dgraph/protos/pb"
"github.com/dgraph-io/dgraph/tok"
@@ -116,12 +117,12 @@ func (m *mapper) writeMapEntriesToFile(entriesBuf []byte, shardIdx int) {
x.Check(x.WriteFileSync(filename, entriesBuf, 0644))
}

func (m *mapper) run(inputFormat int) {
chunker := newChunker(inputFormat)
func (m *mapper) run(inputFormat int, keyFields *[]string) {
chunker := loadfile.NewChunker(inputFormat)
for chunkBuf := range m.readerChunkCh {
done := false
for !done {
nqs, err := chunker.parse(chunkBuf)
nqs, err := chunker.Parse(chunkBuf, keyFields)
if err == io.EOF {
done = true
} else if err != nil {
@@ -131,7 +132,12 @@ func (m *mapper) run(inputFormat int, keyFields *[]string) {
}
}

for _, nq := range nqs {
gqlNqs := make([]gql.NQuad, len(nqs))
for i, nq := range nqs {
gqlNqs[i] = gql.NQuad{NQuad: nq}
}

for _, nq := range gqlNqs {
if err := facets.SortAndValidate(nq.Facets); err != nil {
atomic.AddInt64(&m.prog.errCount, 1)
if !m.opt.IgnoreErrors {
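
Between loader.go and mapper.go, the call sites imply the shape of the new loadfile chunker API (NewChunker, Begin, Chunk, End, Parse); later commits in this PR rename the package to chunker. The interface below is only a reconstruction inferred from those call sites, including the guess that Parse returns []*api.NQuad (the mapper wraps each result in gql.NQuad); the real package may differ.

package chunkersketch

import (
	"bufio"
	"bytes"

	"github.com/dgraph-io/dgo/protos/api"
)

// Input formats accepted by the loaders (names as used in the hunks above).
const (
	RdfInput = iota
	JsonInput
)

// Chunker is a reconstruction of the loadfile API implied by the call sites
// above; the actual interface in the PR may differ.
type Chunker interface {
	// Begin consumes any stream prefix, e.g. the opening '[' of a JSON array.
	Begin(r *bufio.Reader) error
	// Chunk reads the next batch of complete records; io.EOF signals the end.
	Chunk(r *bufio.Reader) (*bytes.Buffer, error)
	// End consumes any stream suffix, e.g. the closing ']' of a JSON array.
	End(r *bufio.Reader) error
	// Parse converts a chunk into N-Quads; keyFields is presumably used by the
	// JSON chunker to derive blank UIDs for records that carry no uid field.
	Parse(chunkBuf *bytes.Buffer, keyFields *[]string) ([]*api.NQuad, error)
}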
12 changes: 9 additions & 3 deletions dgraph/cmd/bulk/run.go
@@ -48,10 +48,10 @@ func init() {

flag := Bulk.Cmd.Flags()
flag.StringP("rdfs", "r", "",
"Directory containing *.rdf or *.rdf.gz files to load.")
"Location of RDF data to load.")
// would be nice to use -j to match -r, but already used by --num_go_routines
flag.String("jsons", "",
"Directory containing *.json or *.json.gz files to load.")
"Location of JSON data to load.")
flag.StringP("schema_file", "s", "",
"Location of schema file to load.")
flag.String("out", "out",
@@ -91,6 +91,7 @@
"more parallelism, but increases memory usage.")
flag.String("custom_tokenizers", "",
"Comma separated list of tokenizer plugins")
flag.StringP("key", "k", "", "Comma-separated list of JSON fields to identify a uid")
}

func run() {
@@ -114,14 +115,15 @@ func run() {
MapShards: Bulk.Conf.GetInt("map_shards"),
ReduceShards: Bulk.Conf.GetInt("reduce_shards"),
CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"),
KeyFields: Bulk.Conf.GetString("key"),
}

x.PrintVersion()
if opt.Version {
os.Exit(0)
}
if opt.SchemaFile == "" {
fmt.Fprint(os.Stderr, "schema file must be specified.\n")
fmt.Fprint(os.Stderr, "Schema file must be specified.\n")
os.Exit(1)
}
if opt.RDFDir == "" && opt.JSONDir == "" {
@@ -178,6 +180,10 @@ func run() {
defer os.RemoveAll(opt.TmpDir)
}

for _, f := range strings.Split(opt.KeyFields, ",") {
opt.parsedKeyFields = append(opt.parsedKeyFields, strings.TrimSpace(f))
}

loader := newLoader(opt)
if !opt.SkipMapPhase {
loader.mapStage()
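
Note that the --key flag added here is removed again by a later commit in this PR (dd237b1). Also, because strings.Split("", ",") returns a slice containing a single empty string, the loop above leaves parsedKeyFields as [""] when --key is unset. A guarded variant, offered as a suggestion rather than as the PR's code, is sketched below.

package main

import (
	"fmt"
	"strings"
)

// parseKeyFields splits a comma-separated --key value into trimmed field
// names. An empty input returns nil, avoiding the single-empty-string slice
// that ranging over strings.Split("", ",") would otherwise produce.
func parseKeyFields(keyFields string) []string {
	if keyFields == "" {
		return nil
	}
	var out []string
	for _, f := range strings.Split(keyFields, ",") {
		out = append(out, strings.TrimSpace(f))
	}
	return out
}

func main() {
	fmt.Println(parseKeyFields("email, phone")) // [email phone]
	fmt.Println(parseKeyFields(""))             // []
}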
22 changes: 11 additions & 11 deletions dgraph/cmd/live/batch.go
@@ -69,13 +69,13 @@ type loader struct {
retryRequestsWg sync.WaitGroup

// Miscellaneous information to print counters.
// Num of RDF's sent
rdfs uint64
// Num of N-Quads sent
nquads uint64
// Num of txns sent
txns uint64
// Num of aborts
aborts uint64
// To get time elapsel.
// To get time elapsed
start time.Time

reqs chan api.Mutation
@@ -109,8 +109,8 @@ func (p *uidProvider) ReserveUidRange() (uint64, uint64, error) {
// Counter keeps a track of various parameters about a batch mutation. Running totals are printed
// if BatchMutationOptions PrintCounters is set to true.
type Counter struct {
// Number of RDF's processed by server.
Rdfs uint64
// Number of N-Quads processed by server.
Nquads uint64
// Number of mutations processed by the server.
TxnsDone uint64
// Number of Aborts
@@ -148,7 +148,7 @@ func (l *loader) infinitelyRetry(req api.Mutation) {
req.CommitNow = true
_, err := txn.Mutate(l.opts.Ctx, &req)
if err == nil {
atomic.AddUint64(&l.rdfs, uint64(len(req.Set)))
atomic.AddUint64(&l.nquads, uint64(len(req.Set)))
atomic.AddUint64(&l.txns, 1)
return
}
@@ -167,7 +167,7 @@ func (l *loader) request(req api.Mutation) {
_, err := txn.Mutate(l.opts.Ctx, &req)

if err == nil {
atomic.AddUint64(&l.rdfs, uint64(len(req.Set)))
atomic.AddUint64(&l.nquads, uint64(len(req.Set)))
atomic.AddUint64(&l.txns, 1)
return
}
@@ -193,17 +193,17 @@ func (l *loader) printCounters() {

for range l.ticker.C {
counter := l.Counter()
rate := float64(counter.Rdfs) / counter.Elapsed.Seconds()
rate := float64(counter.Nquads) / counter.Elapsed.Seconds()
elapsed := time.Since(start).Round(time.Second)
fmt.Printf("[%6s] Txns: %d RDFs: %d RDFs/sec: %5.0f Aborts: %d\n",
elapsed, counter.TxnsDone, counter.Rdfs, rate, counter.Aborts)
fmt.Printf("[%6s] Txns: %d N-Quads: %d N-Quads/sec: %5.0f Aborts: %d\n",
elapsed, counter.TxnsDone, counter.Nquads, rate, counter.Aborts)
}
}

// Counter returns the current state of the BatchMutation.
func (l *loader) Counter() Counter {
return Counter{
Rdfs: atomic.LoadUint64(&l.rdfs),
Nquads: atomic.LoadUint64(&l.nquads),
TxnsDone: atomic.LoadUint64(&l.txns),
Elapsed: time.Since(l.start),
Aborts: atomic.LoadUint64(&l.aborts),
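
The rename from Rdfs to Nquads keeps the same lock-free counter pattern: mutation goroutines bump totals with atomic adds while printCounters reads snapshots with atomic loads to derive an N-Quads/sec rate. The standalone sketch below illustrates that pattern; it is not Dgraph code.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// counter mirrors the pattern in batch.go above: worker goroutines record
// totals with atomic adds, and a reporter reads them with atomic loads.
type counter struct {
	nquads uint64
	txns   uint64
	start  time.Time
}

func (c *counter) record(nquadsInTxn int) {
	atomic.AddUint64(&c.nquads, uint64(nquadsInTxn))
	atomic.AddUint64(&c.txns, 1)
}

func (c *counter) snapshot() (nquads, txns uint64, rate float64) {
	nquads = atomic.LoadUint64(&c.nquads)
	txns = atomic.LoadUint64(&c.txns)
	rate = float64(nquads) / time.Since(c.start).Seconds()
	return
}

func main() {
	c := &counter{start: time.Now()}
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				c.record(10) // pretend each txn set 10 N-Quads
			}
		}()
	}
	wg.Wait()
	nq, txns, rate := c.snapshot()
	fmt.Printf("Txns: %d N-Quads: %d N-Quads/sec: %5.0f\n", txns, nq, rate)
}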