Skip to content

Commit

Permalink
Improve Prov module (InseeFr#370)
Browse files Browse the repository at this point in the history
* Start prov improvements

* Create VtlTypes.java

* Init runWithBindings implementation

* Improve provenance
  • Loading branch information
NicoLaval authored Nov 12, 2024
1 parent 20f01f3 commit 591bb9b
Show file tree
Hide file tree
Showing 10 changed files with 367 additions and 46 deletions.
28 changes: 28 additions & 0 deletions vtl-prov/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,21 @@
<artifactId>vtl-parser</artifactId>
<version>1.8.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>fr.insee.trevas</groupId>
<artifactId>vtl-engine</artifactId>
<version>1.8.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>fr.insee.trevas</groupId>
<artifactId>vtl-spark</artifactId>
<version>1.8.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>fr.insee.trevas</groupId>
<artifactId>vtl-model</artifactId>
<version>1.8.0-SNAPSHOT</version>
</dependency>

<!-- Jena 5 requires Java 17+ -->
<dependency>
Expand All @@ -45,4 +60,17 @@

</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<argLine>--add-exports java.base/sun.nio.ch=ALL-UNNAMED</argLine>
</configuration>
</plugin>
</plugins>
</build>

</project>
142 changes: 125 additions & 17 deletions vtl-prov/src/main/java/fr/insee/vtl/prov/ProvenanceListener.java
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
package fr.insee.vtl.prov;

import fr.insee.vtl.model.Dataset;
import fr.insee.vtl.parser.VtlBaseListener;
import fr.insee.vtl.parser.VtlLexer;
import fr.insee.vtl.parser.VtlParser;
import fr.insee.vtl.prov.prov.DataframeInstance;
import fr.insee.vtl.prov.prov.Program;
import fr.insee.vtl.prov.prov.ProgramStep;
import fr.insee.vtl.prov.prov.VariableInstance;
import fr.insee.vtl.prov.utils.ProvenanceUtils;
import org.antlr.v4.runtime.*;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.tree.ParseTreeWalker;

import java.util.Set;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
* ANTLR Listener that create provenance objects.
Expand All @@ -26,6 +31,22 @@ public class ProvenanceListener extends VtlBaseListener {

private String currentComponentID;

private String currentDataframeID;

private int stepIndex = 1;

private boolean rootAssignment = true;

// Map of label/UUID id
private final Map<String, String> availableDataframeUUID = new HashMap<>();

private Map<String, String> currentAvailableDataframeUUID = new HashMap<>();

// Map of label/UUID id
private final Map<String, String> availableVariableUUID = new HashMap<>();

private Map<String, String> currentAvailableVariableUUID = new HashMap<>();

public ProvenanceListener(String id, String programName) {
program.setId(id);
program.setLabel(programName);
Expand All @@ -44,51 +65,70 @@ public void enterStart(VtlParser.StartContext ctx) {

@Override
public void enterTemporaryAssignment(VtlParser.TemporaryAssignmentContext ctx) {
String id = getText(ctx.varID());
String label = getText(ctx.varID());
String sourceCode = getText(ctx);
currentProgramStep = id;
ProgramStep programStep = new ProgramStep(id, id, sourceCode);
DataframeInstance df = new DataframeInstance(id, id);
currentProgramStep = label;
ProgramStep programStep = new ProgramStep(label, sourceCode, stepIndex);
stepIndex++;
String dfId = UUID.randomUUID().toString();
currentAvailableDataframeUUID.put(label, dfId);
DataframeInstance df = new DataframeInstance(dfId, label);
programStep.setProducedDataframe(df);
program.getProgramSteps().add(programStep);
}

@Override
public void exitTemporaryAssignment(VtlParser.TemporaryAssignmentContext ctx) {
currentProgramStep = null;
availableDataframeUUID.putAll(currentAvailableDataframeUUID);
currentAvailableDataframeUUID = new HashMap<>();
rootAssignment = true;
}

@Override
public void enterPersistAssignment(VtlParser.PersistAssignmentContext ctx) {
String id = getText(ctx.varID());
String label = getText(ctx.varID());
String sourceCode = getText(ctx);
currentProgramStep = id;
ProgramStep programStep = new ProgramStep(id, id, sourceCode);
DataframeInstance df = new DataframeInstance(id, id);
currentProgramStep = label;
ProgramStep programStep = new ProgramStep(label, sourceCode, stepIndex);
stepIndex++;
String dfId = UUID.randomUUID().toString();
currentAvailableDataframeUUID.put(label, dfId);
DataframeInstance df = new DataframeInstance(dfId, label);
programStep.setProducedDataframe(df);
program.getProgramSteps().add(programStep);
}

@Override
public void exitPersistAssignment(VtlParser.PersistAssignmentContext ctx) {
currentProgramStep = null;
availableDataframeUUID.putAll(currentAvailableDataframeUUID);
currentAvailableDataframeUUID = new HashMap<>();
rootAssignment = true;
}

@Override
public void enterVarID(VtlParser.VarIDContext ctx) {
String id = ctx.IDENTIFIER().getText();
if (!id.equals(currentProgramStep)) {
ProgramStep programStep = program.getProgramStepById(currentProgramStep);
String label = ctx.IDENTIFIER().getText();
if (!rootAssignment) {
ProgramStep programStep = program.getProgramStepByLabel(currentProgramStep);
if (!isInDatasetClause) {
Set<DataframeInstance> consumedDataframe = programStep.getConsumedDataframe();
DataframeInstance df = new DataframeInstance(id, id);
String dfId = ProvenanceUtils.getOrBuildUUID(availableDataframeUUID, label);
DataframeInstance df = new DataframeInstance(dfId, label);
consumedDataframe.add(df);
// Certainly don't need to reset? To check!
currentDataframeID = label;
}
if (isInDatasetClause && null != currentComponentID) {
Set<VariableInstance> usedVariables = programStep.getUsedVariables();
VariableInstance v = new VariableInstance(id, id);
String varUUID = ProvenanceUtils.getOrBuildUUID(availableVariableUUID, currentDataframeID + "|" + label);
VariableInstance v = new VariableInstance(varUUID, label);
v.setParentDataframe(currentDataframeID);
usedVariables.add(v);
}
} else {
rootAssignment = false;
}
}

Expand All @@ -104,11 +144,18 @@ public void exitDatasetClause(VtlParser.DatasetClauseContext ctx) {

@Override
public void enterComponentID(VtlParser.ComponentIDContext ctx) {
String id = ctx.getText();
ProgramStep programStep = program.getProgramStepById(currentProgramStep);
String label = ctx.getText();
ProgramStep programStep = program.getProgramStepByLabel(currentProgramStep);
Set<VariableInstance> assignedVariables = programStep.getAssignedVariables();
VariableInstance v = new VariableInstance(id, id);
String variableUUID = ProvenanceUtils.getOrBuildUUID(availableDataframeUUID, label);
VariableInstance v = new VariableInstance(variableUUID, label);
assignedVariables.add(v);
currentAvailableVariableUUID.put(currentDataframeID + "|" + label, variableUUID);
}

@Override
public void exitComponentID(VtlParser.ComponentIDContext ctx) {
System.out.println(ctx.IDENTIFIER());
}

@Override
Expand All @@ -119,6 +166,8 @@ public void enterCalcClauseItem(VtlParser.CalcClauseItemContext ctx) {
@Override
public void exitCalcClauseItem(VtlParser.CalcClauseItemContext ctx) {
currentComponentID = null;
availableVariableUUID.putAll(currentAvailableVariableUUID);
currentAvailableVariableUUID = new HashMap<>();
}

@Override
Expand Down Expand Up @@ -148,4 +197,63 @@ public static Program run(String expr, String id, String programName) {
return provenanceListener.getProgram();
}

public static Program runWithBindings(ScriptEngine engine, String expr, String id, String programName) {
Program program = run(expr, id, programName);
// 0 check if input dataset are empty?
// Keep already handled dataset
List<String> dsHandled = new ArrayList<>();
// Split script to loop over program steps and run them
AtomicInteger index = new AtomicInteger(1);
Arrays.stream(expr.split(";"))
.map(e -> e + ";")
.forEach(stepScript -> {
int i = index.getAndIncrement();
// 1 - Handle input dataset
ProgramStep step = program.getProgramStepByIndex(i);
Set<DataframeInstance> consumedDataframe = step.getConsumedDataframe();
consumedDataframe.forEach(d -> {
if (!dsHandled.contains(d.getLabel())) {
Dataset ds = (Dataset) engine.getContext().getAttribute(d.getLabel());
ds.getDataStructure().values().forEach(c -> {
VariableInstance variableInstance = step.getUsedVariables()
.stream()
.filter(v ->
v.getParentDataframe().equals(d.getLabel()) &&
v.getLabel().equals(c.getName()))
.findFirst()
.orElse(new VariableInstance(c.getName()));
variableInstance.setRole(c.getRole());
variableInstance.setType(c.getType());
d.getHasVariableInstances().add(variableInstance);
});
}
});
try {
engine.eval(stepScript);
} catch (ScriptException e) {
throw new RuntimeException(e);
}
// Improve built variables attributes
DataframeInstance producedDataframe = step.getProducedDataframe();
Dataset ds = (Dataset) engine.getContext().getAttribute(producedDataframe.getLabel());

ds.getDataStructure().values().forEach(c -> {
VariableInstance variableInstance = step.getAssignedVariables()
.stream()
.filter(v -> v.getLabel().equals(c.getName()))
.findFirst()
// TODO: refine variable detection in usedVariable
.orElse(new VariableInstance(c.getName()));
variableInstance.setRole(c.getRole());
variableInstance.setType(c.getType());
producedDataframe.getHasVariableInstances().add(variableInstance);
});
dsHandled.add((producedDataframe.getLabel()));
// Correct usedVariables ID checking ds/var of last assignment
});


return program;
}

}
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
package fr.insee.vtl.prov.prov;

import java.util.HashSet;
import java.util.Set;

public class DataframeInstance {
String id;
String label;

Set<VariableInstance> hasVariableInstances = new HashSet<>();

public DataframeInstance(String id, String label) {
this.id = id;
this.label = label;
Expand All @@ -24,4 +29,14 @@ public String getLabel() {
public void setLabel(String label) {
this.label = label;
}

public Set<VariableInstance> getHasVariableInstances() {
return hasVariableInstances;
}

public void setHasVariableInstances(Set<VariableInstance> hasVariableInstances) {
this.hasVariableInstances = hasVariableInstances;
}


}
23 changes: 21 additions & 2 deletions vtl-prov/src/main/java/fr/insee/vtl/prov/prov/Program.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
import java.util.HashSet;
import java.util.Set;

/* Filled thanks to listener, except for dataframInstances */
public class Program {

String id;
String label;
Set<ProgramStep> programSteps = new HashSet<>();

/* Provided running preview mode */
Set<DataframeInstance> dataframeInstances = new HashSet<>();

String sourceCode;

public Program() {
Expand Down Expand Up @@ -43,6 +47,14 @@ public void setProgramSteps(Set<ProgramStep> programSteps) {
this.programSteps = programSteps;
}

public Set<DataframeInstance> getDataframeInstances() {
return dataframeInstances;
}

public void setDataframeInstances(Set<DataframeInstance> dataframeInstances) {
this.dataframeInstances = dataframeInstances;
}

public String getSourceCode() {
return sourceCode;
}
Expand All @@ -51,9 +63,16 @@ public void setSourceCode(String sourceCode) {
this.sourceCode = sourceCode;
}

public ProgramStep getProgramStepById(String id) {
public ProgramStep getProgramStepByLabel(String label) {
return programSteps.stream()
.filter(p -> p.getLabel().equals(label))
.findFirst()
.orElse(null);
}

public ProgramStep getProgramStepByIndex(int index) {
return programSteps.stream()
.filter(p -> p.getId().equals(id))
.filter(p -> p.getIndex() == index)
.findFirst()
.orElse(null);
}
Expand Down
26 changes: 16 additions & 10 deletions vtl-prov/src/main/java/fr/insee/vtl/prov/prov/ProgramStep.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,24 @@

import java.util.HashSet;
import java.util.Set;
import java.util.UUID;

public class ProgramStep {

String id;
String label;
String sourceCode;
Set<VariableInstance> usedVariables = new HashSet<>();;
Set<VariableInstance> assignedVariables = new HashSet<>();;

Set<DataframeInstance> consumedDataframe = new HashSet<>();;
int index;
Set<VariableInstance> usedVariables = new HashSet<>();
Set<VariableInstance> assignedVariables = new HashSet<>();
Set<DataframeInstance> consumedDataframe = new HashSet<>();
DataframeInstance producedDataframe;


public ProgramStep() {
}

public ProgramStep(String id, String label, String sourceCode) {
this.id = id;
public ProgramStep(String label, String sourceCode, int index) {
this.id = UUID.randomUUID().toString();
this.label = label;
this.sourceCode = sourceCode;
this.index = index;
}

public String getId() {
Expand All @@ -44,6 +42,14 @@ public String getSourceCode() {
return sourceCode;
}

public int getIndex() {
return index;
}

public void setIndex(int index) {
this.index = index;
}

public void setSourceCode(String sourceCode) {
this.sourceCode = sourceCode;
}
Expand Down
Loading

0 comments on commit 591bb9b

Please sign in to comment.