hellofresh · reidab · Aug 19, 2022 · Sep 23, 2022
diff --git a/docs/usage/config.md b/docs/usage/config.md
@@ -19,11 +19,17 @@ You can set a number of keys in the configuration file. Below is a list of all c
   - `Anonymise` - Indicates which columns to anonymise.
   - `Relationships` - Represents a relationship between the table and referenced table.
     - `Table` - The table name.
-    - `ForeignKey` - The table's foreign key. 
+    - `ForeignKey` - The table's foreign key.
     - `ReferencedTable` - The referenced table name.
     - `ReferencedKey` - The referenced table primary key.
+  - `Subsets` - Allows extracting multiple sets of data from the same table.
+    - `Name` - The subset name.
+    - `Filter` - A Klepto definition to filter results, see _Filter_ above.
+    - `Anonymise` - A Klepto definition to anonymise results, see _Anonymise_ above.
+    - `Relationships` - Represents a relationship between the table and referenced table, see _Relationships_ above.
 
-### **IgnoreData**
+
+### IgnoreData
 
 You can dump the database structure without importing data by setting the `IgnoreData` value to `true`.
 
@@ -33,7 +39,7 @@ You can dump the database structure without importing data by setting the `Ignor
  IgnoreData = true
 ```
 
-### **Matchers**
+### Matchers
 
 Matchers are variables to store filter data. You can declare a filter once and reuse it among tables:
 
@@ -56,7 +62,7 @@ Matchers are variables to store filter data. You can declare a filter once and r
     Match = "Latest100Users"
 ```
 
-### **Anonymise**
+### Anonymise
 
 You can anonymise specific columns in your table using the `Anonymise` key. Anonymisation is performed by running a Faker against the specified column.
 
@@ -92,7 +98,7 @@ go get github.com/ungerik/pkgreflect
 fake master pkgreflect -notypes -novars -norecurs vendor/github.com/icrowley/fake/
 ```
 
-### **Relationships**
+### Relationships
 
 The `Relationships` key represents a relationship between the table and referenced table.
 
@@ -119,5 +125,46 @@ To dump the latest 100 users with their orders:
       created_at = "desc"
 ```
 
+### Subsets
+
+Using `Subsets`, you can extract multiple sets of data from a single table.
+
+The same `Filter`, `Anonymise`, and `Relationships` blocks available at the root level can be nested within a subset.
+
+In order to avoid primary key conflicts on insert, it's important to write subset queries so that each record will only fall into a single subset. In the following example, we exclude admin users from the `RecentUsers` subset to avoid conflicts.
+
+To extract the last 100 anonymised users, plus unmodified admin users:
+
+```toml
+[[Tables]]
+  Name = "users"
+
+  [[Tables.Subsets]]
+    Name = "RecentUsers"
+
+    [Tables.Subsets.Filter]
+      Match = "users.admin = false"
+      Limit = 100
+      [Tables.Subsets.Filter.Sorts]
+        created_at = "desc"
+
+    [Tables.Subsets.Anonymise]
+      name = "FullName"
+      email = "EmailAddress"
+
+  [[Tables.Subsets]]
+    Name = "Admins"
+
+    [Tables.Subsets.Filter]
+      Match = "users.admin = true"
+
+    [[Tables.Subsets.Relationships]]
+      ForeignKey = "business_id"
+      ReferencedTable = "businesses"
+      ReferencedKey = "id"
+```
+
+---
+
 !!! info "Tip"
     You can find some [configuration examples](https://github.com/hellofresh/klepto/tree/master/examples) in Klepto's repository.
diff --git a/fixtures/.klepto.toml b/fixtures/.klepto.toml
@@ -4,14 +4,18 @@
 [[Tables]]
   Name = "users"
   IgnoreData = false
-  [Tables.Filter]
-    Match = "users.active = TRUE"
-    Limit = 100
-    [Tables.Filter.Sorts]
-      "user.id" = "asc"
-  [Tables.Anonymise]
-    email = "EmailAddress"
-    firstName = "FirstName"
+
+  [[Tables.Subsets]]
+    Name = "active"
+
+    [Tables.Subsets.Filter]
+      Match = "users.active = TRUE"
+      Limit = 100
+      [Tables.Subsets.Filter.Sorts]
+        "user.id" = "asc"
+    [Tables.Subsets.Anonymise]
+      email = "EmailAddress"
+      firstName = "FirstName"
 
 [[Tables]]
   Name = "orders"
@@ -32,3 +36,34 @@
   [Tables.Filter]
     Match = ""
     Limit = 0
+
+[[Tables]]
+	Name = "vegetables"
+
+[[Tables]]
+  Name = "fruits"
+
+  [Tables.Filter]
+    Match = "fruits.color = 'red'"
+    Limit = 10
+
+  [Tables.Anonymise]
+    name = "FirstName"
+
+[[Tables]]
+  Name = "grains"
+
+  [Tables.Filter]
+    Match = "grains.size = 'large'"
+    Limit = 10
+
+  [Tables.Anonymise]
+    weight = "Digits"
+
+  [[Tables.Subsets]]
+    Name = "starchy"
+
+    [Tables.Subsets.Filter]
+      Match = "grains.starchy = TRUE"
+    [Tables.Subsets.Anonymise]
+      name = "FirstName"
diff --git a/pkg/anonymiser/anonymiser.go b/pkg/anonymiser/anonymiser.go
@@ -46,33 +46,44 @@ func NewAnonymiser(source reader.Reader, tables config.Tables) reader.Reader {
 	return &anonymiser{source, tables}
 }
 
-// ReadTable decorates reader.ReadTable method for anonymising rows published from the reader.Reader
-func (a *anonymiser) ReadTable(tableName string, rowChan chan<- database.Row, opts reader.ReadTableOpt) error {
-	logger := log.WithField("table", tableName)
+// ReadSubset decorates reader.ReadSubset method for anonymising rows published from the reader.Reader
+func (a *anonymiser) ReadSubset(tableName string, subsetIndex int, rowChan chan<- database.Row, opts reader.ReadTableOpt) error {
+	var logger = log.WithFields(log.Fields{"table": tableName, "subsetIndex": subsetIndex})
 	logger.Debug("Loading anonymiser config")
+
 	table := a.tables.FindByName(tableName)
-	if table == nil {
+	var subset *config.Subset
+
+	if table != nil {
+		if subsetIndex < len(table.Subsets) {
+			subset = table.Subsets[subsetIndex]
+		}
+	}
+
+	if table == nil || subset == nil {
 		logger.Debug("the table is not configured to be anonymised")
-		return a.Reader.ReadTable(tableName, rowChan, opts)
+		return a.Reader.ReadSubset(tableName, subsetIndex, rowChan, opts)
 	}
 
-	if len(table.Anonymise) == 0 {
+	logger = log.WithFields(log.Fields{"table": tableName, "subset": subset.Name})
+
+	if len(subset.Anonymise) == 0 {
 		logger.Debug("Skipping anonymiser")
-		return a.Reader.ReadTable(tableName, rowChan, opts)
+		return a.Reader.ReadSubset(tableName, subsetIndex, rowChan, opts)
 	}
 
 	// Create read/write chanel
 	rawChan := make(chan database.Row)
 
-	go func(rowChan chan<- database.Row, rawChan chan database.Row, table *config.Table) {
+	go func(rowChan chan<- database.Row, rawChan chan database.Row, subset *config.Subset) {
 		for {
 			row, more := <-rawChan
 			if !more {
 				close(rowChan)
 				return
 			}
 
-			for column, fakerType := range table.Anonymise {
+			for column, fakerType := range subset.Anonymise {
 				if strings.HasPrefix(fakerType, literalPrefix) {
 					row[column] = strings.TrimPrefix(fakerType, literalPrefix)
 					continue
@@ -109,9 +120,9 @@ func (a *anonymiser) ReadTable(tableName string, rowChan chan<- database.Row, op
 
 			rowChan <- row
 		}
-	}(rowChan, rawChan, table)
+	}(rowChan, rawChan, subset)
 
-	if err := a.Reader.ReadTable(tableName, rawChan, opts); err != nil {
+	if err := a.Reader.ReadSubset(tableName, subsetIndex, rawChan, opts); err != nil {
 		return fmt.Errorf("anonymiser: error while reading table: %w", err)
 	}