Skip to content

Commit

Permalink
Add a test dataset for classifyapp.
Browse files Browse the repository at this point in the history
This adds a target //programl/test/data:classifyapp_dataset which
defines a tarball of test data for the classifyapp task.

github.com//issues/119
  • Loading branch information
ChrisCummins committed Aug 30, 2020
1 parent fd55e6c commit 27d73c3
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 0 deletions.
27 changes: 27 additions & 0 deletions programl/test/data/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -3089,6 +3089,33 @@ filegroup(
srcs = ["module_with_unreachable_instructions.ll"],
)

# Packages the miniature classifyapp test dataset as a bzip2-compressed tarball.
genrule(
    name = "classifyapp_dataset",
    testonly = 1,
    outs = ["classifyapp_dataset.tar.bz2"],
    # Three shell steps chained with `&&`: generate the dataset into a scratch
    # directory beside the output, archive the directory's contents, then
    # delete the scratch directory.
    cmd = " && ".join([
        "$(location :make_classifyapp_dataset) --path=$(@D)/dtmp",
        "tar cjf $(@D)/classifyapp_dataset.tar.bz2 -C $(@D)/dtmp .",
        "rm -rf $(@D)/dtmp",
    ]),
    tools = [":make_classifyapp_dataset"],
)

# Script which generates the classifyapp test dataset. It is run as a tool by
# the :classifyapp_dataset genrule.
py_binary(
    name = "make_classifyapp_dataset",
    testonly = 1,
    srcs = ["make_classifyapp_dataset.py"],
    # Runtime inputs that the script looks up by path: the LLVM-IR files, the
    # program graphs, and the create_vocab binary that it invokes.
    data = [
        ":llvm_ir",
        ":llvm_ir_graphs",
        "//programl/task/dataflow/dataset:create_vocab",
    ],
    deps = [
        "//programl/proto:program_graph_py",
        "//third_party/py/labm8",
    ],
)

genrule(
name = "reachability_dataflow_dataset",
testonly = 1,
Expand Down
99 changes: 99 additions & 0 deletions programl/test/data/make_classifyapp_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2019-2020 the ProGraML authors.
#
# Contact Chris Cummins <[email protected]>.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create a mini classifyapp dataset using test data.
Usage:
$ bazel run //programl/test/data:make_classifyapp_dataset -- \
    --path /path/to/generated/dataset
"""
import os
import random
import shutil
import subprocess
from pathlib import Path

from labm8.py import app, bazelutil, pbutil

from programl.proto.program_graph_pb2 import ProgramGraph

# Command line flag naming the directory in which the dataset is created.
app.DEFINE_string("path", None, "The path to write the generated dataset to.")
FLAGS = app.FLAGS


# Inputs for the generated dataset, looked up through labm8's bazelutil. The
# three paths correspond to the `data` entries of the make_classifyapp_dataset
# py_binary target.

# Directory that is copied verbatim to <root>/ir.
LLVM_IR = bazelutil.DataPath("programl/programl/test/data/llvm_ir")

# Directory of ProgramGraph protocol buffers that is copied to <root>/graphs.
LLVM_IR_GRAPHS = bazelutil.DataPath("programl/programl/test/data/llvm_ir_graphs")

# Executable which is invoked with `--path <root>` once the dataset directory
# has been populated.
CREATE_VOCAB = bazelutil.DataPath(
    "programl/programl/task/dataflow/dataset/create_vocab"
)


def make_classifyapp_dataset(root: Path) -> Path:
    """Make a miniature dataset for classifyapp.

    The dataset is laid out as follows:

        <root>/ir       A copy of the LLVM_IR directory.
        <root>/graphs   A copy of the LLVM_IR_GRAPHS directory, with a
                        "poj104_label" feature set on every graph.
        <root>/train    Relative symlinks to the first 60% of the graphs.
        <root>/val      Relative symlinks to the next 20% of the graphs.
        <root>/test     Relative symlinks to the remaining graphs.

    Once the directories are populated, the CREATE_VOCAB executable is run on
    the dataset root.

    The result is reproducible: graphs are processed in sorted order and the
    labels are drawn from a fixed-seed random number generator, so repeated
    runs over the same inputs produce the same labels and the same split.

    Args:
        root: The root of the dataset. The "train", "val", "test", "graphs" and
            "ir" subdirectories must not already exist.

    Returns:
        The root of the dataset.

    Raises:
        FileExistsError: If one of the dataset subdirectories already exists.
        subprocess.CalledProcessError: If CREATE_VOCAB exits with a non-zero
            status.
    """
    (root / "train").mkdir(parents=True)
    (root / "val").mkdir()
    (root / "test").mkdir()

    shutil.copytree(LLVM_IR_GRAPHS, root / "graphs")
    shutil.copytree(LLVM_IR, root / "ir")

    # List the graphs exactly once, in sorted order. iterdir() yields entries
    # in arbitrary order, so sorting keeps the split stable between runs, and
    # taking the snapshot up front avoids iterating over the directory while
    # the files inside it are being rewritten.
    graph_paths = sorted((root / "graphs").iterdir())

    # Assign a pseudo-random POJ-104 label to each graph. A dedicated generator
    # with a fixed seed is used so that the generated dataset is reproducible
    # and the global random state is left untouched.
    rng = random.Random(0)
    for path in graph_paths:
        graph = pbutil.FromFile(path, ProgramGraph())
        graph.features.feature["poj104_label"].int64_list.value[:] = [
            rng.randint(1, 104)
        ]
        pbutil.ToFile(graph, path)

    # Exclusive end indices of the train and val partitions: 60% / 20% / 20%.
    train_end = int(len(graph_paths) * 0.6)
    val_end = int(len(graph_paths) * 0.8)

    for i, path in enumerate(graph_paths):
        if i < train_end:
            split = "train"
        elif i < val_end:
            split = "val"
        else:
            split = "test"
        # Link relative to the split directory so that the dataset can be moved
        # or archived without breaking the links.
        os.symlink(f"../graphs/{path.name}", root / split / path.name)

    subprocess.check_call([str(CREATE_VOCAB), "--path", str(root)])

    return root


def main():
    """Main entry point.

    Raises:
        app.UsageError: If the --path flag is not set.
    """
    # Validate explicitly rather than with `assert`, which is stripped when
    # Python runs with optimizations enabled.
    if not FLAGS.path:
        raise app.UsageError("--path must be set")
    make_classifyapp_dataset(Path(FLAGS.path))


if __name__ == "__main__":
    app.Run(main)
19 changes: 19 additions & 0 deletions programl/test/py/plugins/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,25 @@

package(default_visibility = ["//programl:__subpackages__"])

# Pytest plugin module which exposes the classifyapp test dataset archive.
py_library(
    name = "classifyapp_dataset",
    testonly = 1,
    srcs = ["classifyapp_dataset.py"],
    # The dataset tarball built by the genrule in //programl/test/data.
    data = ["//programl/test/data:classifyapp_dataset"],
    deps = [
        "//third_party/py/labm8",
    ],
)

# Unit test for the :classifyapp_dataset plugin.
py_test(
    name = "classifyapp_dataset_test",
    # The entry point of a py_test defaults to "<name>.py" and must be listed
    # in srcs, so this has to be the test module rather than the plugin module
    # (which is provided through the :classifyapp_dataset dependency).
    srcs = ["classifyapp_dataset_test.py"],
    deps = [
        ":classifyapp_dataset",
        "//third_party/py/labm8",
    ],
)

py_library(
name = "llvm_ir",
testonly = 1,
Expand Down
29 changes: 29 additions & 0 deletions programl/test/py/plugins/classifyapp_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2019-2020 the ProGraML authors.
#
# Contact Chris Cummins <[email protected]>.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path

from labm8.py import bazelutil, test

# The tarball of the miniature classifyapp dataset, built by the
# //programl/test/data:classifyapp_dataset genrule.
# NOTE(review): the dataset generator script looks up its data files with a
# "programl/programl/..." prefix, whereas this path has a single "programl/"
# component. Confirm that this path resolves as intended.
CLASSIFYAPP_DATASET = bazelutil.DataArchive(
    "programl/test/data/classifyapp_dataset.tar.bz2"
)


@test.Fixture(scope="function")
def classifyapp_dataset() -> Path:
    """A test fixture which yields the root of a classifyapp dataset.

    The dataset archive is entered as a context manager for the duration of
    each test function, and the value it produces is yielded to the test.
    """
    with CLASSIFYAPP_DATASET as d:
        yield d
33 changes: 33 additions & 0 deletions programl/test/py/plugins/classifyapp_dataset_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2019-2020 the ProGraML authors.
#
# Contact Chris Cummins <[email protected]>.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unit tests for //programl/test/py/plugins:classifyapp_dataset."""
from pathlib import Path

from labm8.py import test

# Load the plugin module which defines the `classifyapp_dataset` fixture.
pytest_plugins = ["programl.test.py.plugins.classifyapp_dataset"]


def test_classifyapp_dataset(classifyapp_dataset: Path):
    """Check that the dataset root contains every expected subdirectory."""
    expected_subdirs = ("ir", "graphs", "train", "val", "test")
    for subdir in expected_subdirs:
        assert (classifyapp_dataset / subdir).is_dir()


if __name__ == "__main__":
    test.Main()

0 comments on commit 27d73c3

Please sign in to comment.