diff --git a/README.md b/README.md
index 7d7fc45f..8bd24f20 100644
--- a/README.md
+++ b/README.md
@@ -1517,6 +1517,7 @@ The output looks like this:
| .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. |
| .option("strict_sign_overpunching", "true") | If `true` (default), sign overpunching will only be allowed for signed numbers. If `false`, overpunched positive sign will be allowed for unsigned numbers, but negative sign will result in null. |
| .option("improved_null_detection", "true") | If `true`(default), values that contain only 0x0 ror DISPLAY strings and numbers will be considered `null`s instead of empty strings. |
+| .option("strict_integral_precision", "true") | If `true`, Cobrix will not generate `short`/`integer`/`long` Spark data types, and always use `decimal(n)` with the exact precision that matches the copybook. |
| .option("binary_as_hex", "false") | By default fields that have `PIC X` and `USAGE COMP` are converted to `binary` Spark data type. If this option is set to `true`, such fields will be strings in HEX encoding. |
##### Modifier options
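For context, a minimal usage sketch of the new `strict_integral_precision` option (hypothetical copybook and input path; assumes an existing `SparkSession` named `spark`). The type mapping in the comments mirrors the behaviour exercised by the new `CobolSchemaSpec` test:

```scala
val copybook =
  """        01  RECORD.
    |            05  ACCOUNT-ID  PIC 9(7).
    |            05  AMOUNT      PIC S9(12).
    |""".stripMargin

val df = spark.read
  .format("cobol")
  .option("copybook_contents", copybook)
  .option("strict_integral_precision", "true")  // keep exact copybook precision
  .load("/path/to/ebcdic/data")                 // hypothetical path

df.printSchema()
// ACCOUNT_ID: decimal(7,0)  (instead of integer)
// AMOUNT:     decimal(12,0) (instead of long)
```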
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala
index f4d3e4f0..1e002457 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala
@@ -87,12 +87,14 @@ object CopybookParser extends Logging {
dropGroupFillers: Boolean = false,
dropValueFillers: Boolean = true,
commentPolicy: CommentPolicy = CommentPolicy(),
- dropFillersFromAst: Boolean = false
+ dropFillersFromAst: Boolean = false,
+ strictIntegralPrecision: Boolean = false
): Copybook = {
val copybook = parse(copyBookContents = copyBookContents,
dropGroupFillers = dropGroupFillers,
dropValueFillers = dropValueFillers,
- commentPolicy = commentPolicy)
+ commentPolicy = commentPolicy,
+ strictIntegralPrecision = strictIntegralPrecision)
if (dropFillersFromAst && (dropGroupFillers || dropValueFillers)) {
copybook.dropFillers(dropGroupFillers, dropValueFillers)
@@ -135,6 +137,7 @@ object CopybookParser extends Logging {
commentPolicy: CommentPolicy = CommentPolicy(),
strictSignOverpunch: Boolean = true,
improvedNullDetection: Boolean = false,
+ strictIntegralPrecision: Boolean = false,
decodeBinaryAsHex: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
@@ -155,6 +158,7 @@ object CopybookParser extends Logging {
commentPolicy,
strictSignOverpunch,
improvedNullDetection,
+ strictIntegralPrecision,
decodeBinaryAsHex,
ebcdicCodePage,
asciiCharset,
@@ -197,6 +201,7 @@ object CopybookParser extends Logging {
commentPolicy: CommentPolicy = CommentPolicy(),
strictSignOverpunch: Boolean = true,
improvedNullDetection: Boolean = false,
+ strictIntegralPrecision: Boolean = false,
decodeBinaryAsHex: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
@@ -217,6 +222,7 @@ object CopybookParser extends Logging {
commentPolicy,
strictSignOverpunch,
improvedNullDetection,
+ strictIntegralPrecision,
decodeBinaryAsHex,
ebcdicCodePage,
asciiCharset,
@@ -262,6 +268,7 @@ object CopybookParser extends Logging {
commentPolicy: CommentPolicy,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean,
+ strictIntegralPrecision: Boolean,
decodeBinaryAsHex: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
@@ -272,7 +279,7 @@ object CopybookParser extends Logging {
debugFieldsPolicy: DebugFieldsPolicy,
fieldCodePageMap: Map[String, String]): Copybook = {
- val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)
+ val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)
val nonTerms: Set[String] = (for (id <- nonTerminals)
yield transformIdentifier(id)
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala
index 6e1364a9..670f4ec5 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala
@@ -57,13 +57,14 @@ object ANTLRParser extends Logging {
commentPolicy: CommentPolicy,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean,
+ strictIntegralPrecision: Boolean,
decodeBinaryAsHex: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat,
fieldCodePageMap: Map[String, String]): CopybookAST = {
- val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, decodeBinaryAsHex, fieldCodePageMap)
+ val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap)
val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
line =>
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala
index 7b102647..20908ad0 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala
@@ -47,6 +47,7 @@ class ParserVisitor(enc: Encoding,
floatingPointFormat: FloatingPointFormat,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean,
+ strictIntegralPrecision: Boolean,
decodeBinaryAsHex: Boolean,
fieldCodePageMap: Map[String, String]) extends copybookParserBaseVisitor[Expr] {
/* expressions */
@@ -853,7 +854,7 @@ class ParserVisitor(enc: Encoding,
Map(),
isDependee = false,
identifier.toUpperCase() == Constants.FILLER,
- DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
+ DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
) (Some(parent))
parent.children.append(prim)
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
index b26e5491..e99e04fb 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
@@ -42,13 +42,15 @@ object DecoderSelector {
*
Integral types are represented as boxed integers and longs. Larger integral numbers are represented as BigDecimal
*
*
- * @param dataType A daatype of a copybook field
- * @param stringTrimmingPolicy Specifies how the decoder should handle string types
- * @param ebcdicCodePage Specifies a code page to use for EBCDIC to ASCII/Unicode conversion
- * @param asciiCharset A charset for ASCII encoded data
- * @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
- * @param floatingPointFormat Specifies a floating point format (IBM or IEEE754)
- * @param strictSignOverpunch if true, sign overpunching is not allowed for positive numbers.
+ * @param dataType A datatype of a copybook field
+ * @param stringTrimmingPolicy Specifies how the decoder should handle string types
+ * @param ebcdicCodePage Specifies a code page to use for EBCDIC to ASCII/Unicode conversion
+ * @param asciiCharset A charset for ASCII encoded data
+ * @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
+ * @param floatingPointFormat Specifies a floating point format (IBM or IEEE754)
+ * @param strictSignOverpunch if true, sign overpunching is not allowed for positive numbers.
+ * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
+ * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @return A function that converts an array of bytes to the target data type.
*/
def getDecoder(dataType: CobolType,
@@ -58,11 +60,12 @@ object DecoderSelector {
isUtf16BigEndian: Boolean = true,
floatingPointFormat: FloatingPointFormat = FloatingPointFormat.IBM,
strictSignOverpunch: Boolean = false,
- improvedNullDetection: Boolean = false): Decoder = {
+ improvedNullDetection: Boolean = false,
+ strictIntegralPrecision: Boolean = false): Decoder = {
val decoder = dataType match {
case alphaNumeric: AlphaNumeric => getStringDecoder(alphaNumeric.enc.getOrElse(EBCDIC), stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, improvedNullDetection)
case decimalType: Decimal => getDecimalDecoder(decimalType, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
- case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection)
+ case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
case _ => throw new IllegalStateException("Unknown AST object")
}
decoder
@@ -186,7 +189,8 @@ object DecoderSelector {
/** Gets a decoder function for an integral data type. A direct conversion from array of bytes to the target type is used where possible. */
private def getIntegralDecoder(integralType: Integral,
strictSignOverpunch: Boolean,
- improvedNullDetection: Boolean): Decoder = {
+ improvedNullDetection: Boolean,
+ strictIntegralPrecision: Boolean): Decoder = {
val encoding = integralType.enc.getOrElse(EBCDIC)
val isEbcidic = encoding match {
@@ -198,7 +202,12 @@ object DecoderSelector {
integralType.compact match {
case None =>
- if (integralType.precision <= Constants.maxIntegerPrecision) {
+ if (strictIntegralPrecision) {
+ if (isEbcidic)
+ StringDecoders.decodeEbcdicBigNumber(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
+ else
+ StringDecoders.decodeAsciiBigNumber(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
+ } else if (integralType.precision <= Constants.maxIntegerPrecision) {
if (isEbcidic)
StringDecoders.decodeEbcdicInt(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
else
@@ -223,57 +232,60 @@ object DecoderSelector {
throw new IllegalStateException("Unexpected error. COMP-2 (double) is incorrect for an integral number.")
case Some(COMP3()) =>
// COMP-3 aka BCD-encoded number
- getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = true)
+ getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = true, strictIntegralPrecision)
case Some(COMP3U()) =>
// COMP-3U aka Unsigned BCD-encoded number aka Unsigned Packed
- getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = false)
+ getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = false, strictIntegralPrecision)
case Some(COMP4()) =>
// COMP aka BINARY encoded number
- getBinaryEncodedIntegralDecoder(Some(COMP4()), integralType.precision, integralType.signPosition, isBigEndian = true)
+ getBinaryEncodedIntegralDecoder(Some(COMP4()), integralType.precision, integralType.signPosition, isBigEndian = true, strictIntegralPrecision)
case Some(COMP5()) =>
// COMP aka BINARY encoded number
- getBinaryEncodedIntegralDecoder(Some(COMP5()), integralType.precision, integralType.signPosition, isBigEndian = true)
+ getBinaryEncodedIntegralDecoder(Some(COMP5()), integralType.precision, integralType.signPosition, isBigEndian = true, strictIntegralPrecision)
case Some(COMP9()) =>
// COMP aka BINARY encoded number
- getBinaryEncodedIntegralDecoder(Some(COMP9()), integralType.precision, integralType.signPosition, isBigEndian = false)
+ getBinaryEncodedIntegralDecoder(Some(COMP9()), integralType.precision, integralType.signPosition, isBigEndian = false, strictIntegralPrecision)
case _ =>
throw new IllegalStateException(s"Unknown number compression format (${integralType.compact.get}).")
}
}
/** Gets a decoder function for a binary encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */
- private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean): Decoder = {
+ private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean, strictIntegralPrecision: Boolean): Decoder = {
val isSigned = signPosition.nonEmpty
- val isSignLeft = signPosition.forall(sp => if (sp == za.co.absa.cobrix.cobol.parser.position.Left) true else false)
val numOfBytes = BinaryUtils.getBytesCount(compact, precision, isSigned, isExplicitDecimalPt = false, isSignSeparate = false)
- val decoder = (isSigned, isBigEndian, numOfBytes) match {
- case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _
- case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _
- case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _
- case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _
- case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _
- case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _
- case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _
- case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _
- case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _
- case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _
- case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _
- case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _
- case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _
- case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _
- case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _
- case (false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _
- case _ =>
- (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
+ val decoder = if (strictIntegralPrecision) {
+ (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
+ } else {
+ (isSigned, isBigEndian, numOfBytes) match {
+ case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _
+ case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _
+ case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _
+ case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _
+ case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _
+ case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _
+ case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _
+ case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _
+ case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _
+ case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _
+ case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _
+ case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _
+ case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _
+ case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _
+ case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _
+ case (false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _
+ case _ =>
+ (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
+ }
}
decoder // 999 999 999
}
/** Gets a decoder function for a BCD-encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */
- private def getBCDIntegralDecoder(precision: Int, mandatorySignNibble: Boolean): Decoder = {
+ private def getBCDIntegralDecoder(precision: Int, mandatorySignNibble: Boolean, strictIntegralPrecision: Boolean): Decoder = {
val decoder =
- if (precision <= Constants.maxIntegerPrecision) {
+ if (precision <= Constants.maxIntegerPrecision && !strictIntegralPrecision) {
a: Array[Byte] => {
val num = BCDNumberDecoders.decodeBCDIntegralNumber(a, mandatorySignNibble)
if (num != null) {
@@ -282,7 +294,7 @@ object DecoderSelector {
null
}
}
- } else if (precision <= Constants.maxLongPrecision) {
+ } else if (precision <= Constants.maxLongPrecision && !strictIntegralPrecision) {
a: Array[Byte] => BCDNumberDecoders.decodeBCDIntegralNumber(a, mandatorySignNibble)
} else {
a: Array[Byte] =>
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
index ae1ebb19..efda1f4d 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
@@ -30,20 +30,20 @@ object RecordExtractors {
/**
* This method extracts a record from the specified array of bytes. The copybook for the record needs to be already parsed.
*
- * @param ast The parsed copybook.
- * @param data The data bits containing the record.
- * @param offsetBytes The offset to the beginning of the record (in bits).
- * @param policy A schema retention policy to be applied to the extracted record.
- * @param variableLengthOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements.
- * @param generateRecordId If true, a record id field will be added as the first field of the record.
- * @param generateRecordBytes If true, a record bytes field will be added at the beginning of each record.
- * @param segmentLevelIds Segment ids to put to the extracted record if id generation it turned on.
- * @param fileId A file id to be put to the extractor record if generateRecordId == true.
- * @param recordId The record id to be saved to the record id field.
- * @param activeSegmentRedefine An active segment redefine (the one that will be parsed).
- * All other segment redefines will be skipped.
- * @param generateInputFileField if true, a field containing input file name will be generated
- * @param inputFileName An input file name to put if its generation is needed
+ * @param ast The parsed copybook.
+ * @param data The data bits containing the record.
+ * @param offsetBytes The offset to the beginning of the record (in bits).
+ * @param policy A schema retention policy to be applied to the extracted record.
+ * @param variableLengthOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements.
+ * @param generateRecordId If true, a record id field will be added as the first field of the record.
+ * @param generateRecordBytes If true, a record bytes field will be added at the beginning of each record.
+ * @param segmentLevelIds        Segment ids to put to the extracted record if id generation is turned on.
+ * @param fileId A file id to be put to the extractor record if generateRecordId == true.
+ * @param recordId The record id to be saved to the record id field.
+ * @param activeSegmentRedefine An active segment redefine (the one that will be parsed).
+ * All other segment redefines will be skipped.
+ * @param generateInputFileField if true, a field containing input file name will be generated
+ * @param inputFileName An input file name to put if its generation is needed
* @return An Array[Any] object corresponding to the record schema.
*/
@throws(classOf[IllegalStateException])
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala
index 5ee95651..e9efdcc2 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala
@@ -26,77 +26,79 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
/**
* This class holds parameters for the job.
*
- * @param copybookPath String containing the path to the copybook in a given file system.
- * @param multiCopybookPath Sequence containing the paths to the copybooks.
- * @param copybookContent String containing the actual content of the copybook. Either this, the copybookPath, or multiCopybookPath parameter must be specified.
- * @param sourcePaths The list of source file paths.
- * @param recordFormat The record format (F, V, VB, D)
- * @param isText [deprecated by recordFormat] If true the input data consists of text files where records are separated by a line ending character
- * @param isEbcdic If true the input data file encoding is EBCDIC, otherwise it is ASCII
- * @param ebcdicCodePage Specifies what code page to use for EBCDIC to ASCII/Unicode conversions
- * @param ebcdicCodePageClass An optional custom code page conversion class provided by a user
- * @param asciiCharset A charset for ASCII data
- * @param fieldCodePage Specifies a mapping between a field name and the code page
- * @param isUtf16BigEndian If true UTF-16 is considered big-endian.
- * @param floatingPointFormat A format of floating-point numbers
- * @param recordStartOffset A number of bytes to skip at the beginning of the record before parsing a record according to a copybook
- * @param recordEndOffset A number of bytes to skip at the end of each record
- * @param recordLength Specifies the length of the record disregarding the copybook record size. Implied the file has fixed record length.
- * @param minimumRecordLength Minium record length for which the record is considered valid.
- * @param maximumRecordLength Maximum record length for which the record is considered valid.
- * @param variableLengthParams VariableLengthParameters containing the specifications for the consumption of variable-length Cobol records.
- * @param variableSizeOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements
- * @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record
- * @param schemaRetentionPolicy A copybook usually has a root group struct element that acts like a rowtag in XML. This can be retained in Spark schema or can be collapsed
- * @param stringTrimmingPolicy Specify if and how strings should be trimmed when parsed
- * @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example)
- * @param multisegmentParams Parameters for reading multisegment mainframe files
- * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
- * @param decodeBinaryAsHex Decode binary fields as HEX strings
- * @param commentPolicy A comment truncation policy
- * @param dropGroupFillers If true the parser will drop all FILLER fields, even GROUP FILLERS that have non-FILLER nested fields
- * @param dropValueFillers If true the parser will drop all value FILLER fields
- * @param nonTerminals A list of non-terminals (GROUPS) to combine and parse as primitive fields
- * @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
- * @param debugIgnoreFileSize If true the fixed length file reader won't check file size divisibility. Useful for debugging binary file / copybook mismatches.
- * @param metadataPolicy Specifies the policy of metadat fields to be added to the Spark schema
+ * @param copybookPath String containing the path to the copybook in a given file system.
+ * @param multiCopybookPath Sequence containing the paths to the copybooks.
+ * @param copybookContent String containing the actual content of the copybook. Either this, the copybookPath, or multiCopybookPath parameter must be specified.
+ * @param sourcePaths The list of source file paths.
+ * @param recordFormat The record format (F, V, VB, D)
+ * @param isText [deprecated by recordFormat] If true the input data consists of text files where records are separated by a line ending character
+ * @param isEbcdic If true the input data file encoding is EBCDIC, otherwise it is ASCII
+ * @param ebcdicCodePage Specifies what code page to use for EBCDIC to ASCII/Unicode conversions
+ * @param ebcdicCodePageClass An optional custom code page conversion class provided by a user
+ * @param asciiCharset A charset for ASCII data
+ * @param fieldCodePage Specifies a mapping between a field name and the code page
+ * @param isUtf16BigEndian If true UTF-16 is considered big-endian.
+ * @param floatingPointFormat A format of floating-point numbers
+ * @param recordStartOffset A number of bytes to skip at the beginning of the record before parsing a record according to a copybook
+ * @param recordEndOffset A number of bytes to skip at the end of each record
+ * @param recordLength            Specifies the length of the record disregarding the copybook record size. Implies the file has a fixed record length.
+ * @param minimumRecordLength     Minimum record length for which the record is considered valid.
+ * @param maximumRecordLength Maximum record length for which the record is considered valid.
+ * @param variableLengthParams VariableLengthParameters containing the specifications for the consumption of variable-length Cobol records.
+ * @param variableSizeOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements
+ * @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record
+ * @param schemaRetentionPolicy A copybook usually has a root group struct element that acts like a rowtag in XML. This can be retained in Spark schema or can be collapsed
+ * @param stringTrimmingPolicy Specify if and how strings should be trimmed when parsed
+ * @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example)
+ * @param multisegmentParams Parameters for reading multisegment mainframe files
+ * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
+ * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
+ * @param decodeBinaryAsHex Decode binary fields as HEX strings
+ * @param commentPolicy A comment truncation policy
+ * @param dropGroupFillers If true the parser will drop all FILLER fields, even GROUP FILLERS that have non-FILLER nested fields
+ * @param dropValueFillers If true the parser will drop all value FILLER fields
+ * @param nonTerminals A list of non-terminals (GROUPS) to combine and parse as primitive fields
+ * @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
+ * @param debugIgnoreFileSize If true the fixed length file reader won't check file size divisibility. Useful for debugging binary file / copybook mismatches.
+ * @param metadataPolicy          Specifies the policy of metadata fields to be added to the Spark schema
*/
case class CobolParameters(
- copybookPath: Option[String],
- multiCopybookPath: Seq[String],
- copybookContent: Option[String],
- sourcePaths: Seq[String],
- recordFormat: RecordFormat,
- isText: Boolean,
- isEbcdic: Boolean,
- ebcdicCodePage: String,
- ebcdicCodePageClass: Option[String],
- asciiCharset: Option[String],
- fieldCodePage: Map[String, String],
- isUtf16BigEndian: Boolean,
- floatingPointFormat: FloatingPointFormat,
- recordStartOffset: Int,
- recordEndOffset: Int,
- recordLength: Option[Int],
- minimumRecordLength: Option[Int],
- maximumRecordLength: Option[Int],
- variableLengthParams: Option[VariableLengthParameters],
- variableSizeOccurs: Boolean,
- generateRecordBytes: Boolean,
- schemaRetentionPolicy: SchemaRetentionPolicy,
- stringTrimmingPolicy: StringTrimmingPolicy,
- allowPartialRecords: Boolean,
- multisegmentParams: Option[MultisegmentParameters],
- commentPolicy: CommentPolicy,
- strictSignOverpunch: Boolean,
- improvedNullDetection: Boolean,
- decodeBinaryAsHex: Boolean,
- dropGroupFillers: Boolean,
- dropValueFillers: Boolean,
- fillerNamingPolicy: FillerNamingPolicy,
- nonTerminals: Seq[String],
- occursMappings: Map[String, Map[String, Int]],
- debugFieldsPolicy: DebugFieldsPolicy,
- debugIgnoreFileSize: Boolean,
- metadataPolicy: MetadataPolicy
+ copybookPath: Option[String],
+ multiCopybookPath: Seq[String],
+ copybookContent: Option[String],
+ sourcePaths: Seq[String],
+ recordFormat: RecordFormat,
+ isText: Boolean,
+ isEbcdic: Boolean,
+ ebcdicCodePage: String,
+ ebcdicCodePageClass: Option[String],
+ asciiCharset: Option[String],
+ fieldCodePage: Map[String, String],
+ isUtf16BigEndian: Boolean,
+ floatingPointFormat: FloatingPointFormat,
+ recordStartOffset: Int,
+ recordEndOffset: Int,
+ recordLength: Option[Int],
+ minimumRecordLength: Option[Int],
+ maximumRecordLength: Option[Int],
+ variableLengthParams: Option[VariableLengthParameters],
+ variableSizeOccurs: Boolean,
+ generateRecordBytes: Boolean,
+ schemaRetentionPolicy: SchemaRetentionPolicy,
+ stringTrimmingPolicy: StringTrimmingPolicy,
+ allowPartialRecords: Boolean,
+ multisegmentParams: Option[MultisegmentParameters],
+ commentPolicy: CommentPolicy,
+ strictSignOverpunch: Boolean,
+ improvedNullDetection: Boolean,
+ strictIntegralPrecision: Boolean,
+ decodeBinaryAsHex: Boolean,
+ dropGroupFillers: Boolean,
+ dropValueFillers: Boolean,
+ fillerNamingPolicy: FillerNamingPolicy,
+ nonTerminals: Seq[String],
+ occursMappings: Map[String, Map[String, Int]],
+ debugFieldsPolicy: DebugFieldsPolicy,
+ debugIgnoreFileSize: Boolean,
+ metadataPolicy: MetadataPolicy
)
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
index b002d1ed..36c546ff 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
@@ -63,6 +63,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
* @param multisegment Parameters specific to reading multisegment files
* @param commentPolicy A comment truncation policy
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
+ * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param decodeBinaryAsHex Decode binary fields as HEX strings
* @param dropGroupFillers If true the parser will drop all FILLER fields, even GROUP FILLERS that have non-FILLER nested fields
* @param dropValueFillers If true the parser will drop all value FILLER fields
@@ -112,6 +113,7 @@ case class ReaderParameters(
commentPolicy: CommentPolicy = CommentPolicy(),
strictSignOverpunch: Boolean = true,
improvedNullDetection: Boolean = false,
+ strictIntegralPrecision: Boolean = false,
decodeBinaryAsHex: Boolean = false,
dropGroupFillers: Boolean = false,
dropValueFillers: Boolean = true,
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
index 4411b91f..f45f67a7 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
@@ -36,6 +36,7 @@ import scala.collection.immutable.HashMap
*
* @param copybook A parsed copybook.
* @param policy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
+ * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
* @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema.
@@ -45,6 +46,7 @@ import scala.collection.immutable.HashMap
*/
class CobolSchema(val copybook: Copybook,
val policy: SchemaRetentionPolicy,
+ val strictIntegralPrecision: Boolean,
val inputFileNameField: String,
val generateRecordId: Boolean,
val generateRecordBytes: Boolean,
@@ -94,6 +96,7 @@ object CobolSchema {
readerParameters.commentPolicy,
readerParameters.strictSignOverpunch,
readerParameters.improvedNullDetection,
+ readerParameters.strictIntegralPrecision,
readerParameters.decodeBinaryAsHex,
codePage,
asciiCharset,
@@ -116,6 +119,7 @@ object CobolSchema {
readerParameters.commentPolicy,
readerParameters.strictSignOverpunch,
readerParameters.improvedNullDetection,
+ readerParameters.strictIntegralPrecision,
readerParameters.decodeBinaryAsHex,
codePage,
asciiCharset,
@@ -128,7 +132,7 @@ object CobolSchema {
))
val segIdFieldCount = readerParameters.multisegment.map(p => p.segmentLevelIds.size).getOrElse(0)
val segmentIdPrefix = readerParameters.multisegment.map(p => p.segmentIdPrefix).getOrElse("")
- new CobolSchema(schema, readerParameters.schemaPolicy, readerParameters.inputFileNameColumn, readerParameters.generateRecordId, readerParameters.generateRecordBytes, segIdFieldCount, segmentIdPrefix, readerParameters.metadataPolicy)
+ new CobolSchema(schema, readerParameters.schemaPolicy, readerParameters.strictIntegralPrecision, readerParameters.inputFileNameColumn, readerParameters.generateRecordId, readerParameters.generateRecordBytes, segIdFieldCount, segmentIdPrefix, readerParameters.metadataPolicy)
}
def getCodePage(codePageName: String, codePageClass: Option[String]): CodePage = {
diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala
index 083ca342..8c6cd3a9 100644
--- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala
+++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala
@@ -40,6 +40,7 @@ class DataSizeSpec extends AnyFunSuite {
floatingPointFormat = FloatingPointFormat.IBM,
strictSignOverpunch = true,
improvedNullDetection = false,
+ strictIntegralPrecision = false,
decodeBinaryAsHex = false,
fieldCodePageMap = Map.empty)
diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala
index 6220c5eb..020b00ff 100644
--- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala
+++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala
@@ -40,6 +40,7 @@ class PicValidationSpec extends AnyFunSuite {
FloatingPointFormat.IBM,
strictSignOverpunch = true,
improvedNullDetection = false,
+ strictIntegralPrecision = false,
decodeBinaryAsHex = false,
fieldCodePageMap = Map.empty)
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
index a94e4b4f..6fc78d8d 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
@@ -71,6 +71,7 @@ object CobolParametersParser extends Logging {
val PARAM_GROUP_FILLERS = "drop_group_fillers"
val PARAM_VALUE_FILLERS = "drop_value_fillers"
val PARAM_FILLER_NAMING_POLICY = "filler_naming_policy"
+ val PARAM_STRICT_INTEGRAL_PRECISION = "strict_integral_precision"
val PARAM_GROUP_NOT_TERMINALS = "non_terminals"
val PARAM_OCCURS_MAPPINGS = "occurs_mappings"
@@ -272,6 +273,7 @@ object CobolParametersParser extends Logging {
parseCommentTruncationPolicy(params),
params.getOrElse(PARAM_STRICT_SIGN_OVERPUNCHING, "true").toBoolean,
params.getOrElse(PARAM_IMPROVED_NULL_DETECTION, "true").toBoolean,
+ params.getOrElse(PARAM_STRICT_INTEGRAL_PRECISION, "false").toBoolean,
params.getOrElse(PARAM_BINARY_AS_HEX, "false").toBoolean,
params.getOrElse(PARAM_GROUP_FILLERS, "false").toBoolean,
params.getOrElse(PARAM_VALUE_FILLERS, "true").toBoolean,
@@ -408,6 +410,7 @@ object CobolParametersParser extends Logging {
parameters.commentPolicy,
parameters.strictSignOverpunch,
parameters.improvedNullDetection,
+ parameters.strictIntegralPrecision,
parameters.decodeBinaryAsHex,
parameters.dropGroupFillers,
parameters.dropValueFillers,
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
index 470a5139..490c185b 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
@@ -39,7 +39,8 @@ import scala.collection.mutable.ArrayBuffer
* provides the corresponding Spark schema and also other properties for the Spark data source.
*
* @param copybook A parsed copybook.
- * @param policy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
+ * @param schemaRetentionPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
+ * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
* @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema.
@@ -48,15 +49,16 @@ import scala.collection.mutable.ArrayBuffer
* @param metadataPolicy Specifies a policy to generate metadata fields.
*/
class CobolSchema(copybook: Copybook,
- policy: SchemaRetentionPolicy,
- inputFileNameField: String,
- generateRecordId: Boolean,
+ schemaRetentionPolicy: SchemaRetentionPolicy,
+ strictIntegralPrecision: Boolean = false,
+ inputFileNameField: String = "",
+ generateRecordId: Boolean = false,
generateRecordBytes: Boolean = false,
generateSegIdFieldsCnt: Int = 0,
segmentIdProvidedPrefix: String = "",
metadataPolicy: MetadataPolicy = MetadataPolicy.Basic)
extends CobolReaderSchema(
- copybook, policy, inputFileNameField, generateRecordId, generateRecordBytes,
+ copybook, schemaRetentionPolicy, strictIntegralPrecision, inputFileNameField, generateRecordId, generateRecordBytes,
generateSegIdFieldsCnt, segmentIdProvidedPrefix
) with Logging with Serializable {
@@ -90,7 +92,7 @@ class CobolSchema(copybook: Copybook,
val redefines = copybook.getAllSegmentRedefines
parseGroup(group, redefines)
}
- val expandRecords = if (policy == SchemaRetentionPolicy.CollapseRoot || copybook.isFlatCopybook) {
+ val expandRecords = if (schemaRetentionPolicy == SchemaRetentionPolicy.CollapseRoot || copybook.isFlatCopybook) {
// Expand root group fields
records.toArray.flatMap(group => group.dataType.asInstanceOf[StructType].fields)
} else {
@@ -181,7 +183,9 @@ class CobolSchema(copybook: Copybook,
case Some(RAW) => BinaryType
case _ => StringType
}
- case dt: Integral =>
+ case dt: Integral if strictIntegralPrecision =>
+ DecimalType(precision = dt.precision, scale = 0)
+ case dt: Integral =>
if (dt.precision > Constants.maxLongPrecision) {
DecimalType(precision = dt.precision, scale = 0)
} else if (dt.precision > Constants.maxIntegerPrecision) {
@@ -326,6 +330,7 @@ object CobolSchema {
new CobolSchema(
schema.copybook,
schema.policy,
+ schema.strictIntegralPrecision,
schema.inputFileNameField,
schema.generateRecordId,
schema.generateRecordBytes,
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala
index be750523..79822d20 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala
@@ -102,6 +102,6 @@ class CobolSchemaHierarchicalSpec extends AnyWordSpec {
private def parseSchema(copybook: String, segmentRedefines: List[String], fieldParentMap: Map[String, String]): CobolSchema = {
val parsedSchema = CopybookParser.parseTree(copybook, segmentRedefines = segmentRedefines, fieldParentMap = fieldParentMap)
- new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "",false, false)
+ new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "",false, false)
}
}
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
index 9d275e8c..f047614a 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
@@ -16,7 +16,7 @@
package za.co.absa.cobrix.spark.cobol
-import org.apache.spark.sql.types.{ArrayType, IntegerType, LongType, StringType, StructType}
+import org.apache.spark.sql.types.{ArrayType, DecimalType, IntegerType, LongType, StringType, StructType}
import org.scalatest.wordspec.AnyWordSpec
import org.slf4j.{Logger, LoggerFactory}
import za.co.absa.cobrix.cobol.parser.CopybookParser
@@ -54,7 +54,25 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false)
+ val actualSchema = cobolSchema.getSparkSchema.treeString
+
+ assertEqualsMultiline(actualSchema, expectedSchema)
+ }
+
+ "Derive integral strict precision Spark schema from a Copybook" in {
+ val expectedSchema =
+ """root
+ | |-- BIN_INT: decimal(4,0) (nullable = true)
+ | |-- STRUCT_FLD: struct (nullable = true)
+ | | |-- STR_FLD: string (nullable = true)
+ | |-- DATA_STRUCT: struct (nullable = true)
+ | | |-- EXAMPLE_INT_FLD: decimal(7,0) (nullable = true)
+ | | |-- EXAMPLE_STR_FLD: string (nullable = true)
+ |""".stripMargin.replaceAll("[\\r\\n]", "\n")
+
+ val parsedSchema = CopybookParser.parseTree(copyBookContents)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, true, "", false, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -75,7 +93,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -94,7 +112,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, true)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, true)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -116,7 +134,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, true)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, true)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -144,7 +162,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", true, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -160,7 +178,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -177,7 +195,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -192,7 +210,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, true)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, true)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -210,7 +228,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, true)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, true)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -224,7 +242,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -253,7 +271,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", true, false, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, false, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -271,7 +289,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", false, true, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, true, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -292,7 +310,28 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", true, true, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, true, 2)
+ val actualSchema = cobolSchema.getSparkSchema.treeString
+
+ assertEqualsMultiline(actualSchema, expectedSchema)
+ }
+
+ "multi-segment keep-original with record id and bytes generation and strict integral precision" in {
+ val expectedSchema =
+ """root
+ | |-- File_Id: integer (nullable = false)
+ | |-- Record_Id: long (nullable = false)
+ | |-- Record_Byte_Length: integer (nullable = false)
+ | |-- Record_Bytes: binary (nullable = false)
+ | |-- Seg_Id0: string (nullable = true)
+ | |-- Seg_Id1: string (nullable = true)
+ | |-- STRUCT1: struct (nullable = true)
+ | | |-- IntValue: decimal(6,0) (nullable = true)
+ | |-- STRUCT2: struct (nullable = true)
+ | | |-- STR_FLD: string (nullable = true)
+ |""".stripMargin.replaceAll("[\\r\\n]", "\n")
+ val parsedSchema = CopybookParser.parseTree(copyBook)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, true, "", true, true, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -309,7 +348,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", false, false, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -327,7 +366,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, false, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -342,7 +381,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, false, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -361,7 +400,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", false, false)
+ val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false)
val actualSparkSchema = cobolSchema1.getSparkSchema
val rootField = actualSparkSchema.fields.head.dataType.asInstanceOf[StructType]
@@ -390,7 +429,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, false)
+ val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false)
val actualSparkSchema = cobolSchema1.getSparkSchema
val metadataStr1 = actualSparkSchema.fields.head.metadata
@@ -429,6 +468,27 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
assert(sparkSchema.fields(2).dataType == IntegerType)
}
+ "return a schema for a copybook with strict decimal" in {
+ val copybook: String =
+ """ 01 RECORD.
+ | 05 STR1 PIC X(10).
+ | 05 NUM2 PIC S9(12).
+ | 05 NUM3 PIC 9(7).
+ |""".stripMargin
+
+ val cobolSchema = CobolSchema.fromSparkOptions(Seq(copybook), Map("strict_integral_precision" -> "true"))
+
+ val sparkSchema = cobolSchema.getSparkSchema
+
+ assert(sparkSchema.fields.length == 3)
+ assert(sparkSchema.fields.head.name == "STR1")
+ assert(sparkSchema.fields.head.dataType == StringType)
+ assert(sparkSchema.fields(1).name == "NUM2")
+ assert(sparkSchema.fields(1).dataType == DecimalType(12, 0))
+ assert(sparkSchema.fields(2).name == "NUM3")
+ assert(sparkSchema.fields(2).dataType == DecimalType(7, 0))
+ }
+
"return a schema for multiple copybooks" in {
val copybook1: String =
""" 01 RECORD1.
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
index 269c2ab8..a4604bf9 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
@@ -37,7 +37,7 @@ class SparkSchemaSpec extends AnyFunSuite {
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "",false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "",false, false)
val sparkSchema = cobolSchema.getSparkSchema
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
index 33a8fc3b..2a9a7cc8 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
@@ -24,7 +24,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
import scala.collection.Seq
-class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(new Copybook(Group.root), SchemaRetentionPolicy.KeepOriginal, "", false, false) with Serializable {
+class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(new Copybook(Group.root), SchemaRetentionPolicy.KeepOriginal, false, "", false, false) with Serializable {
override def getSparkSchema: StructType = sparkSchema
override lazy val getRecordSize: Int = 40
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test38StrictIntegralPrecisionSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test38StrictIntegralPrecisionSpec.scala
new file mode 100644
index 00000000..fff54f62
--- /dev/null
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test38StrictIntegralPrecisionSpec.scala
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.source.integration
+
+import org.scalatest.wordspec.AnyWordSpec
+import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
+import za.co.absa.cobrix.spark.cobol.source.fixtures.{BinaryFileFixture, TextComparisonFixture}
+
+class Test38StrictIntegralPrecisionSpec extends AnyWordSpec with SparkTestBase with BinaryFileFixture with TextComparisonFixture {
+ private val copybook =
+ """ 01 R.
+ 03 SEG-ID PIC X(1).
+ 03 SEG1.
+ 05 NUM1 PIC 9(2).
+ 03 SEG2 REDEFINES SEG1.
+ 05 NUM2 PIC S9(9).
+ 03 SEG3 REDEFINES SEG1.
+ 05 NUM3 PIC S9(15).
+ """
+
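+  // EBCDIC test data: three records whose lengths follow record_length_map in the read options (A -> 4, B -> 10, C -> 16 bytes).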
+ val dataSimple: Array[Byte] = Array(
+    0xC1, 0xF1, 0xF2, 0xF3, // record 0 'A123' (NUM1 = 12)
+ 0xC2, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, // record 1 'B123456789'
+ 0xC3, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9,
+ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5).map(_.toByte) // record 2 'C123456789012345'
+
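+  // Default behaviour: integral PIC fields keep Cobrix's standard Spark types (integer / long).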
+ "non-strict decimals" should {
+ "be decoded normally" in {
+ withTempBinFile("strict_integral_precision", ".tmp", dataSimple) { tempFile =>
+ val expectedSchema =
+ """root
+ | |-- SEG_ID: string (nullable = true)
+ | |-- SEG1: struct (nullable = true)
+ | | |-- NUM1: integer (nullable = true)
+ | |-- SEG2: struct (nullable = true)
+ | | |-- NUM2: integer (nullable = true)
+ | |-- SEG3: struct (nullable = true)
+ | | |-- NUM3: long (nullable = true)""".stripMargin
+
+ val expectedData = """{"SEG_ID":"A","SEG1":{"NUM1":12}},{"SEG_ID":"B","SEG2":{"NUM2":123456789}},{"SEG_ID":"C","SEG3":{"NUM3":123456789012345}}"""
+
+ val df = spark.read
+ .format("cobol")
+ .option("copybook_contents", copybook)
+ .option("record_format", "F")
+ .option("record_length_field", "SEG-ID")
+ .option("segment_field", "SEG-ID")
+ .option("input_split_records", "2")
+ .option("pedantic", "true")
+ .option("record_length_map", """{"A":4,"B":10,"C":16}""")
+ .option("redefine_segment_id_map:0", "SEG1 => A")
+ .option("redefine_segment_id_map:1", "SEG2 => B")
+ .option("redefine_segment_id_map:2", "SEG3 => C")
+ .load(tempFile)
+
+ val actualSchema = df.schema.treeString
+ val actualData = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+ compareText(actualSchema, expectedSchema)
+ assert(actualData == expectedData)
+ }
+ }
+ }
+
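+  // With strict_integral_precision = true, the same fields are expected as decimal(p, 0) with the exact copybook precision.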
+ "strict decimals" should {
+ "be decoded as decimals for normal files" in {
+ withTempBinFile("strict_integral_precision", ".tmp", dataSimple) { tempFile =>
+ val expectedSchema =
+ """root
+ | |-- SEG_ID: string (nullable = true)
+ | |-- SEG1: struct (nullable = true)
+ | | |-- NUM1: decimal(2,0) (nullable = true)
+ | |-- SEG2: struct (nullable = true)
+ | | |-- NUM2: decimal(9,0) (nullable = true)
+ | |-- SEG3: struct (nullable = true)
+ | | |-- NUM3: decimal(15,0) (nullable = true)""".stripMargin
+
+ val expectedData = """{"SEG_ID":"A","SEG1":{"NUM1":12}},{"SEG_ID":"B","SEG2":{"NUM2":123456789}},{"SEG_ID":"C","SEG3":{"NUM3":123456789012345}}"""
+
+ val df = spark.read
+ .format("cobol")
+ .option("copybook_contents", copybook)
+ .option("record_format", "F")
+ .option("record_length_field", "SEG-ID")
+ .option("segment_field", "SEG-ID")
+ .option("input_split_records", "2")
+ .option("pedantic", "true")
+ .option("record_length_map", """{"A":4,"B":10,"C":16}""")
+ .option("redefine_segment_id_map:0", "SEG1 => A")
+ .option("redefine_segment_id_map:1", "SEG2 => B")
+ .option("redefine_segment_id_map:2", "SEG3 => C")
+ .option("strict_integral_precision", "true")
+ .load(tempFile)
+
+ val actualSchema = df.schema.treeString
+ val actualData = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+ compareText(actualSchema, expectedSchema)
+ assert(actualData == expectedData)
+ }
+ }
+
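+    // The same data read hierarchically: segment-children makes SEG2 and SEG3 child arrays under SEG1.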
+ "be decoded as decimals for hierarchical files" in {
+ withTempBinFile("strict_integral_precision", ".tmp", dataSimple) { tempFile =>
+ val expectedSchema =
+ """root
+ | |-- SEG_ID: string (nullable = true)
+ | |-- SEG1: struct (nullable = true)
+ | | |-- NUM1: decimal(2,0) (nullable = true)
+ | | |-- SEG2: array (nullable = true)
+ | | | |-- element: struct (containsNull = true)
+ | | | | |-- NUM2: decimal(9,0) (nullable = true)
+ | | |-- SEG3: array (nullable = true)
+ | | | |-- element: struct (containsNull = true)
+ | | | | |-- NUM3: decimal(15,0) (nullable = true)""".stripMargin
+
+ val expectedData = """{"SEG_ID":"A","SEG1":{"NUM1":12,"SEG2":[{"NUM2":123456789}],"SEG3":[{"NUM3":123456789012345}]}}"""
+
+ val df = spark.read
+ .format("cobol")
+ .option("copybook_contents", copybook)
+ .option("record_format", "F")
+ .option("record_length_field", "SEG-ID")
+ .option("segment_field", "SEG-ID")
+ .option("input_split_records", "2")
+ .option("pedantic", "true")
+ .option("record_length_map", """{"A":4,"B":10,"C":16}""")
+ .option("redefine_segment_id_map:0", "SEG1 => A")
+ .option("redefine_segment_id_map:1", "SEG2 => B")
+ .option("redefine_segment_id_map:2", "SEG3 => C")
+ .option("segment-children:1", "SEG1 => SEG2,SEG3")
+
+ .option("strict_integral_precision", "true")
+ .load(tempFile)
+
+ val actualSchema = df.schema.treeString
+ val actualData = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+ compareText(actualSchema, expectedSchema)
+ assert(actualData == expectedData)
+ }
+ }
+ }
+}
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
index b8bdb3fb..7c7712ae 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
@@ -52,7 +52,7 @@ class Test02TextFilesOldSchool extends AnyFunSuite with SparkTestBase with Binar
withTempTextFile("text_ascii", ".txt", StandardCharsets.UTF_8, textFileContent) { tmpFileName =>
val parsedCopybook = CopybookParser.parse(copybook, dataEncoding = ASCII, stringTrimmingPolicy = StringTrimmingPolicy.TrimNone)
- val cobolSchema = new CobolSchema(parsedCopybook, SchemaRetentionPolicy.CollapseRoot, "", false)
+ val cobolSchema = new CobolSchema(parsedCopybook, SchemaRetentionPolicy.CollapseRoot, false, "", false)
val sparkSchema = cobolSchema.getSparkSchema
val rddText = spark.sparkContext