Skip to content

Commit

Permalink
#251 Fix glob support and divibility check for large amount of files.
Browse files Browse the repository at this point in the history
  • Loading branch information
yruslan committed Feb 22, 2020
1 parent 15a0d28 commit 5ac819c
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,9 @@ private [source] object CobolScanners {

private def areThereNonDivisibleFiles(sourceDir: String, hadoopConfiguration: Configuration, divisor: Int): Boolean = {

val THRESHOLD_DIR_LENGTH_FOR_SINGLE_FILE_CHECK = 50

val fileSystem = FileSystem.get(hadoopConfiguration)

if (FileUtils.getNumberOfFilesInDir(sourceDir, fileSystem) < THRESHOLD_DIR_LENGTH_FOR_SINGLE_FILE_CHECK) {
if (FileUtils.getNumberOfFilesInDir(sourceDir, fileSystem) < FileUtils.THRESHOLD_DIR_LENGTH_FOR_SINGLE_FILE_CHECK) {
FileUtils.findAndLogAllNonDivisibleFiles(sourceDir, divisor, fileSystem) > 0
}
else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ object FileUtils {

private val logger = LoggerFactory.getLogger(this.getClass)

val THRESHOLD_DIR_LENGTH_FOR_SINGLE_FILE_CHECK = 50

private val hiddenFileFilter = new PathFilter() {
def accept(p: Path): Boolean = {
val name = p.getName
Expand Down Expand Up @@ -195,9 +197,10 @@ object FileUtils {
*/
def findAndLogFirstNonDivisibleFile(sourceDir: String, divisor: Long, fileSystem: FileSystem): Boolean = {

val allFiles = fileSystem.listStatus(new Path(sourceDir))
val allFiles = expandDirectories(fileSystem, fileSystem.globStatus(new Path(sourceDir), hiddenFileFilter))

val firstNonDivisibleFile = allFiles.find(isNonDivisible(_, divisor))
val firstNonDivisibleFile = allFiles.take(THRESHOLD_DIR_LENGTH_FOR_SINGLE_FILE_CHECK)
.find(isNonDivisible(_, divisor))

if (firstNonDivisibleFile.isDefined) {
logger.error(s"File ${firstNonDivisibleFile.get.getPath} IS NOT divisible by $divisor.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ package za.co.absa.cobrix.spark.cobol.source.fixtures

import java.io.{DataOutputStream, File, FileOutputStream}
import java.nio.charset.Charset
import java.nio.file.{Files, Path}

import org.apache.commons.io.{FileSystemUtils, FileUtils}

/**
* This fixture adds ability for a unit test to create temporary files for using them in the tests.
Expand Down Expand Up @@ -67,5 +70,14 @@ trait BinaryFileFixture {
tempFile.delete
}

def withTempDirectory(prefix: String)(f: String => Unit): Unit = {
val tmpPath = Files.createTempDirectory(prefix)
val pathStr = tmpPath.toAbsolutePath.toString

f(pathStr)

FileUtils.deleteDirectory(new File(pathStr))
}

}

Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package za.co.absa.cobrix.spark.cobol.source.regression

import java.io.{DataOutputStream, File, FileOutputStream}
import java.nio.file.{FileSystem, Files, Path, Paths}

import org.apache.hadoop.fs.{FileSystem => HadoopFs}
import org.scalatest.FunSuite
import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture
import za.co.absa.cobrix.spark.cobol.utils.FileUtils

class Test07IgnoreHiddenFiles extends FunSuite with BinaryFileFixture with SparkTestBase {
private val fileSystem = HadoopFs.get(spark.sparkContext.hadoopConfiguration)

test("Test findAndLogFirstNonDivisibleFile() finds a file") {
withTempDirectory("testHidden1") { tmpDir =>
createFileSize1(Files.createFile(Paths.get(tmpDir, "a")))
assert(FileUtils.findAndLogFirstNonDivisibleFile(tmpDir, 2, fileSystem))
assert(FileUtils.findAndLogAllNonDivisibleFiles(tmpDir, 2, fileSystem) == 1)
}
}

test("Test findAndLogFirstNonDivisibleFile() ignores a hidden file") {
withTempDirectory("testHidden1") { tmpDir =>
createFileSize1(Files.createFile(Paths.get(tmpDir, ".a")))
assert(!FileUtils.findAndLogFirstNonDivisibleFile(tmpDir, 2, fileSystem))
assert(FileUtils.findAndLogAllNonDivisibleFiles(tmpDir, 2, fileSystem) == 0)
}
}

test("Test findAndLogFirstNonDivisibleFile() ignores a hidden file in a nested dir") {
withTempDirectory("testHidden3") { tmpDir =>
Files.createDirectory(Paths.get(tmpDir, "dir1"))
createFileSize1(Files.createFile(Paths.get(tmpDir, "dir1", ".b2")))
assert(!FileUtils.findAndLogFirstNonDivisibleFile(tmpDir, 2, fileSystem))
assert(FileUtils.findAndLogAllNonDivisibleFiles(tmpDir, 2, fileSystem) == 0)
}
}

test("Test findAndLogFirstNonDivisibleFile() ignores a hidden dir") {
withTempDirectory("testHidden4") { tmpDir =>
Files.createDirectory(Paths.get(tmpDir, ".dir2"))
createFileSize1(Files.createFile(Paths.get(tmpDir, ".dir2", "c1")))
assert(!FileUtils.findAndLogFirstNonDivisibleFile(tmpDir, 2, fileSystem))
assert(FileUtils.findAndLogAllNonDivisibleFiles(tmpDir, 2, fileSystem) == 0)
}
}

test("Test findAndLogFirstNonDivisibleFile() works with globbing") {
withTempDirectory("testHidden1") { tmpDir =>
createFileSize1(Files.createFile(Paths.get(tmpDir, "a")))
assert(FileUtils.findAndLogFirstNonDivisibleFile(s"$tmpDir/*", 2, fileSystem))
assert(FileUtils.findAndLogAllNonDivisibleFiles(tmpDir, 2, fileSystem) == 1)
}
}


private def createFileSize1(path: Path): Unit = {
val file = new File(path.toAbsolutePath.toString)
val ostream = new DataOutputStream(new FileOutputStream(file))
val content = Array[Byte](0.toByte)
ostream.write(content)
ostream.close()
}
}

0 comments on commit 5ac819c

Please sign in to comment.