diff --git a/README.md b/README.md index 7d7fc45f..8bd24f20 100644 --- a/README.md +++ b/README.md @@ -1517,6 +1517,7 @@ The output looks like this: | .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. | | .option("strict_sign_overpunching", "true") | If `true` (default), sign overpunching will only be allowed for signed numbers. If `false`, overpunched positive sign will be allowed for unsigned numbers, but negative sign will result in null. | | .option("improved_null_detection", "true") | If `true`(default), values that contain only 0x0 ror DISPLAY strings and numbers will be considered `null`s instead of empty strings. | +| .option("strict_integral_precision", "true") | If `true`, Cobrix will not generate `short`/`integer`/`long` Spark data types, and always use `decimal(n)` with the exact precision that matches the copybook. | | .option("binary_as_hex", "false") | By default fields that have `PIC X` and `USAGE COMP` are converted to `binary` Spark data type. If this option is set to `true`, such fields will be strings in HEX encoding. | ##### Modifier options diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala index f4d3e4f0..1e002457 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala @@ -87,12 +87,14 @@ object CopybookParser extends Logging { dropGroupFillers: Boolean = false, dropValueFillers: Boolean = true, commentPolicy: CommentPolicy = CommentPolicy(), - dropFillersFromAst: Boolean = false + dropFillersFromAst: Boolean = false, + strictIntegralPrecision: Boolean = false ): Copybook = { val copybook = parse(copyBookContents = copyBookContents, dropGroupFillers = dropGroupFillers, dropValueFillers = dropValueFillers, - commentPolicy = commentPolicy) + commentPolicy = commentPolicy, + strictIntegralPrecision = strictIntegralPrecision) if (dropFillersFromAst && (dropGroupFillers || dropValueFillers)) { copybook.dropFillers(dropGroupFillers, dropValueFillers) @@ -135,6 +137,7 @@ object CopybookParser extends Logging { commentPolicy: CommentPolicy = CommentPolicy(), strictSignOverpunch: Boolean = true, improvedNullDetection: Boolean = false, + strictIntegralPrecision: Boolean = false, decodeBinaryAsHex: Boolean = false, ebcdicCodePage: CodePage = new CodePageCommon, asciiCharset: Charset = StandardCharsets.US_ASCII, @@ -155,6 +158,7 @@ object CopybookParser extends Logging { commentPolicy, strictSignOverpunch, improvedNullDetection, + strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, @@ -197,6 +201,7 @@ object CopybookParser extends Logging { commentPolicy: CommentPolicy = CommentPolicy(), strictSignOverpunch: Boolean = true, improvedNullDetection: Boolean = false, + strictIntegralPrecision: Boolean = false, decodeBinaryAsHex: Boolean = false, ebcdicCodePage: CodePage = new CodePageCommon, asciiCharset: Charset = StandardCharsets.US_ASCII, @@ -217,6 +222,7 @@ object CopybookParser extends Logging { commentPolicy, strictSignOverpunch, improvedNullDetection, + strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, @@ -262,6 +268,7 @@ object CopybookParser extends Logging { commentPolicy: CommentPolicy, strictSignOverpunch: Boolean, improvedNullDetection: Boolean, + strictIntegralPrecision: 
Boolean, decodeBinaryAsHex: Boolean, ebcdicCodePage: CodePage, asciiCharset: Charset, @@ -272,7 +279,7 @@ object CopybookParser extends Logging { debugFieldsPolicy: DebugFieldsPolicy, fieldCodePageMap: Map[String, String]): Copybook = { - val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap) + val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap) val nonTerms: Set[String] = (for (id <- nonTerminals) yield transformIdentifier(id) diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala index 6e1364a9..670f4ec5 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala @@ -57,13 +57,14 @@ object ANTLRParser extends Logging { commentPolicy: CommentPolicy, strictSignOverpunch: Boolean, improvedNullDetection: Boolean, + strictIntegralPrecision: Boolean, decodeBinaryAsHex: Boolean, ebcdicCodePage: CodePage, asciiCharset: Charset, isUtf16BigEndian: Boolean, floatingPointFormat: FloatingPointFormat, fieldCodePageMap: Map[String, String]): CopybookAST = { - val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, decodeBinaryAsHex, fieldCodePageMap) + val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap) val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map( line => diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala index 7b102647..20908ad0 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala @@ -47,6 +47,7 @@ class ParserVisitor(enc: Encoding, floatingPointFormat: FloatingPointFormat, strictSignOverpunch: Boolean, improvedNullDetection: Boolean, + strictIntegralPrecision: Boolean, decodeBinaryAsHex: Boolean, fieldCodePageMap: Map[String, String]) extends copybookParserBaseVisitor[Expr] { /* expressions */ @@ -853,7 +854,7 @@ class ParserVisitor(enc: Encoding, Map(), isDependee = false, identifier.toUpperCase() == Constants.FILLER, - DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection) + DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision) ) (Some(parent)) parent.children.append(prim) diff --git 
a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala index b26e5491..e99e04fb 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala @@ -42,13 +42,15 @@ object DecoderSelector { *
*   - Integral types are represented as boxed integers and longs. Larger integral numbers are represented as BigDecimal
  • * * - * @param dataType A daatype of a copybook field - * @param stringTrimmingPolicy Specifies how the decoder should handle string types - * @param ebcdicCodePage Specifies a code page to use for EBCDIC to ASCII/Unicode conversion - * @param asciiCharset A charset for ASCII encoded data - * @param isUtf16BigEndian If true UTF-16 strings are considered big-endian. - * @param floatingPointFormat Specifies a floating point format (IBM or IEEE754) - * @param strictSignOverpunch if true, sign overpunching is not allowed for positive numbers. + * @param dataType A datatype of a copybook field + * @param stringTrimmingPolicy Specifies how the decoder should handle string types + * @param ebcdicCodePage Specifies a code page to use for EBCDIC to ASCII/Unicode conversion + * @param asciiCharset A charset for ASCII encoded data + * @param isUtf16BigEndian If true UTF-16 strings are considered big-endian. + * @param floatingPointFormat Specifies a floating point format (IBM or IEEE754) + * @param strictSignOverpunch if true, sign overpunching is not allowed for positive numbers. + * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null. + * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook. * @return A function that converts an array of bytes to the target data type. */ def getDecoder(dataType: CobolType, @@ -58,11 +60,12 @@ object DecoderSelector { isUtf16BigEndian: Boolean = true, floatingPointFormat: FloatingPointFormat = FloatingPointFormat.IBM, strictSignOverpunch: Boolean = false, - improvedNullDetection: Boolean = false): Decoder = { + improvedNullDetection: Boolean = false, + strictIntegralPrecision: Boolean = false): Decoder = { val decoder = dataType match { case alphaNumeric: AlphaNumeric => getStringDecoder(alphaNumeric.enc.getOrElse(EBCDIC), stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, improvedNullDetection) case decimalType: Decimal => getDecimalDecoder(decimalType, floatingPointFormat, strictSignOverpunch, improvedNullDetection) - case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection) + case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision) case _ => throw new IllegalStateException("Unknown AST object") } decoder @@ -186,7 +189,8 @@ object DecoderSelector { /** Gets a decoder function for an integral data type. A direct conversion from array of bytes to the target type is used where possible. 
*/ private def getIntegralDecoder(integralType: Integral, strictSignOverpunch: Boolean, - improvedNullDetection: Boolean): Decoder = { + improvedNullDetection: Boolean, + strictIntegralPrecision: Boolean): Decoder = { val encoding = integralType.enc.getOrElse(EBCDIC) val isEbcidic = encoding match { @@ -198,7 +202,12 @@ object DecoderSelector { integralType.compact match { case None => - if (integralType.precision <= Constants.maxIntegerPrecision) { + if (strictIntegralPrecision) { + if (isEbcidic) + StringDecoders.decodeEbcdicBigNumber(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection) + else + StringDecoders.decodeAsciiBigNumber(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection) + } else if (integralType.precision <= Constants.maxIntegerPrecision) { if (isEbcidic) StringDecoders.decodeEbcdicInt(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection) else @@ -223,57 +232,60 @@ object DecoderSelector { throw new IllegalStateException("Unexpected error. COMP-2 (double) is incorrect for an integral number.") case Some(COMP3()) => // COMP-3 aka BCD-encoded number - getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = true) + getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = true, strictIntegralPrecision) case Some(COMP3U()) => // COMP-3U aka Unsigned BCD-encoded number aka Unsigned Packed - getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = false) + getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = false, strictIntegralPrecision) case Some(COMP4()) => // COMP aka BINARY encoded number - getBinaryEncodedIntegralDecoder(Some(COMP4()), integralType.precision, integralType.signPosition, isBigEndian = true) + getBinaryEncodedIntegralDecoder(Some(COMP4()), integralType.precision, integralType.signPosition, isBigEndian = true, strictIntegralPrecision) case Some(COMP5()) => // COMP aka BINARY encoded number - getBinaryEncodedIntegralDecoder(Some(COMP5()), integralType.precision, integralType.signPosition, isBigEndian = true) + getBinaryEncodedIntegralDecoder(Some(COMP5()), integralType.precision, integralType.signPosition, isBigEndian = true, strictIntegralPrecision) case Some(COMP9()) => // COMP aka BINARY encoded number - getBinaryEncodedIntegralDecoder(Some(COMP9()), integralType.precision, integralType.signPosition, isBigEndian = false) + getBinaryEncodedIntegralDecoder(Some(COMP9()), integralType.precision, integralType.signPosition, isBigEndian = false, strictIntegralPrecision) case _ => throw new IllegalStateException(s"Unknown number compression format (${integralType.compact.get}).") } } /** Gets a decoder function for a binary encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. 
*/ - private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean): Decoder = { + private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean, strictIntegralPrecision: Boolean): Decoder = { val isSigned = signPosition.nonEmpty - val isSignLeft = signPosition.forall(sp => if (sp == za.co.absa.cobrix.cobol.parser.position.Left) true else false) val numOfBytes = BinaryUtils.getBytesCount(compact, precision, isSigned, isExplicitDecimalPt = false, isSignSeparate = false) - val decoder = (isSigned, isBigEndian, numOfBytes) match { - case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _ - case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _ - case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _ - case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _ - case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _ - case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _ - case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _ - case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _ - case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _ - case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _ - case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _ - case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _ - case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _ - case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _ - case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _ - case (false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _ - case _ => - (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned) + val decoder = if (strictIntegralPrecision) { + (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned) + } else { + (isSigned, isBigEndian, numOfBytes) match { + case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _ + case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _ + case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _ + case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _ + case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _ + case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _ + case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _ + case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _ + case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _ + case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _ + case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _ + case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _ + case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _ + case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _ + case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _ + case (false, false, 8) => 
BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _ + case _ => + (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned) + } } decoder // 999 999 999 } /** Gets a decoder function for a BCD-encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */ - private def getBCDIntegralDecoder(precision: Int, mandatorySignNibble: Boolean): Decoder = { + private def getBCDIntegralDecoder(precision: Int, mandatorySignNibble: Boolean, strictIntegralPrecision: Boolean): Decoder = { val decoder = - if (precision <= Constants.maxIntegerPrecision) { + if (precision <= Constants.maxIntegerPrecision && !strictIntegralPrecision) { a: Array[Byte] => { val num = BCDNumberDecoders.decodeBCDIntegralNumber(a, mandatorySignNibble) if (num != null) { @@ -282,7 +294,7 @@ object DecoderSelector { null } } - } else if (precision <= Constants.maxLongPrecision) { + } else if (precision <= Constants.maxLongPrecision && !strictIntegralPrecision) { a: Array[Byte] => BCDNumberDecoders.decodeBCDIntegralNumber(a, mandatorySignNibble) } else { a: Array[Byte] => diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala index ae1ebb19..efda1f4d 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala @@ -30,20 +30,20 @@ object RecordExtractors { /** * This method extracts a record from the specified array of bytes. The copybook for the record needs to be already parsed. * - * @param ast The parsed copybook. - * @param data The data bits containing the record. - * @param offsetBytes The offset to the beginning of the record (in bits). - * @param policy A schema retention policy to be applied to the extracted record. - * @param variableLengthOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements. - * @param generateRecordId If true, a record id field will be added as the first field of the record. - * @param generateRecordBytes If true, a record bytes field will be added at the beginning of each record. - * @param segmentLevelIds Segment ids to put to the extracted record if id generation it turned on. - * @param fileId A file id to be put to the extractor record if generateRecordId == true. - * @param recordId The record id to be saved to the record id field. - * @param activeSegmentRedefine An active segment redefine (the one that will be parsed). - * All other segment redefines will be skipped. - * @param generateInputFileField if true, a field containing input file name will be generated - * @param inputFileName An input file name to put if its generation is needed + * @param ast The parsed copybook. + * @param data The data bits containing the record. + * @param offsetBytes The offset to the beginning of the record (in bits). + * @param policy A schema retention policy to be applied to the extracted record. + * @param variableLengthOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements. + * @param generateRecordId If true, a record id field will be added as the first field of the record. + * @param generateRecordBytes If true, a record bytes field will be added at the beginning of each record. 
+ * @param segmentLevelIds Segment ids to put to the extracted record if id generation it turned on. + * @param fileId A file id to be put to the extractor record if generateRecordId == true. + * @param recordId The record id to be saved to the record id field. + * @param activeSegmentRedefine An active segment redefine (the one that will be parsed). + * All other segment redefines will be skipped. + * @param generateInputFileField if true, a field containing input file name will be generated + * @param inputFileName An input file name to put if its generation is needed * @return An Array[Any] object corresponding to the record schema. */ @throws(classOf[IllegalStateException]) diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala index 5ee95651..e9efdcc2 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala @@ -26,77 +26,79 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten /** * This class holds parameters for the job. * - * @param copybookPath String containing the path to the copybook in a given file system. - * @param multiCopybookPath Sequence containing the paths to the copybooks. - * @param copybookContent String containing the actual content of the copybook. Either this, the copybookPath, or multiCopybookPath parameter must be specified. - * @param sourcePaths The list of source file paths. - * @param recordFormat The record format (F, V, VB, D) - * @param isText [deprecated by recordFormat] If true the input data consists of text files where records are separated by a line ending character - * @param isEbcdic If true the input data file encoding is EBCDIC, otherwise it is ASCII - * @param ebcdicCodePage Specifies what code page to use for EBCDIC to ASCII/Unicode conversions - * @param ebcdicCodePageClass An optional custom code page conversion class provided by a user - * @param asciiCharset A charset for ASCII data - * @param fieldCodePage Specifies a mapping between a field name and the code page - * @param isUtf16BigEndian If true UTF-16 is considered big-endian. - * @param floatingPointFormat A format of floating-point numbers - * @param recordStartOffset A number of bytes to skip at the beginning of the record before parsing a record according to a copybook - * @param recordEndOffset A number of bytes to skip at the end of each record - * @param recordLength Specifies the length of the record disregarding the copybook record size. Implied the file has fixed record length. - * @param minimumRecordLength Minium record length for which the record is considered valid. - * @param maximumRecordLength Maximum record length for which the record is considered valid. - * @param variableLengthParams VariableLengthParameters containing the specifications for the consumption of variable-length Cobol records. - * @param variableSizeOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements - * @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record - * @param schemaRetentionPolicy A copybook usually has a root group struct element that acts like a rowtag in XML. 
This can be retained in Spark schema or can be collapsed - * @param stringTrimmingPolicy Specify if and how strings should be trimmed when parsed - * @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example) - * @param multisegmentParams Parameters for reading multisegment mainframe files - * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null. - * @param decodeBinaryAsHex Decode binary fields as HEX strings - * @param commentPolicy A comment truncation policy - * @param dropGroupFillers If true the parser will drop all FILLER fields, even GROUP FILLERS that have non-FILLER nested fields - * @param dropValueFillers If true the parser will drop all value FILLER fields - * @param nonTerminals A list of non-terminals (GROUPS) to combine and parse as primitive fields - * @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw). - * @param debugIgnoreFileSize If true the fixed length file reader won't check file size divisibility. Useful for debugging binary file / copybook mismatches. - * @param metadataPolicy Specifies the policy of metadat fields to be added to the Spark schema + * @param copybookPath String containing the path to the copybook in a given file system. + * @param multiCopybookPath Sequence containing the paths to the copybooks. + * @param copybookContent String containing the actual content of the copybook. Either this, the copybookPath, or multiCopybookPath parameter must be specified. + * @param sourcePaths The list of source file paths. + * @param recordFormat The record format (F, V, VB, D) + * @param isText [deprecated by recordFormat] If true the input data consists of text files where records are separated by a line ending character + * @param isEbcdic If true the input data file encoding is EBCDIC, otherwise it is ASCII + * @param ebcdicCodePage Specifies what code page to use for EBCDIC to ASCII/Unicode conversions + * @param ebcdicCodePageClass An optional custom code page conversion class provided by a user + * @param asciiCharset A charset for ASCII data + * @param fieldCodePage Specifies a mapping between a field name and the code page + * @param isUtf16BigEndian If true UTF-16 is considered big-endian. + * @param floatingPointFormat A format of floating-point numbers + * @param recordStartOffset A number of bytes to skip at the beginning of the record before parsing a record according to a copybook + * @param recordEndOffset A number of bytes to skip at the end of each record + * @param recordLength Specifies the length of the record disregarding the copybook record size. Implied the file has fixed record length. + * @param minimumRecordLength Minium record length for which the record is considered valid. + * @param maximumRecordLength Maximum record length for which the record is considered valid. + * @param variableLengthParams VariableLengthParameters containing the specifications for the consumption of variable-length Cobol records. + * @param variableSizeOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements + * @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record + * @param schemaRetentionPolicy A copybook usually has a root group struct element that acts like a rowtag in XML. 
This can be retained in Spark schema or can be collapsed + * @param stringTrimmingPolicy Specify if and how strings should be trimmed when parsed + * @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example) + * @param multisegmentParams Parameters for reading multisegment mainframe files + * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null. + * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook. + * @param decodeBinaryAsHex Decode binary fields as HEX strings + * @param commentPolicy A comment truncation policy + * @param dropGroupFillers If true the parser will drop all FILLER fields, even GROUP FILLERS that have non-FILLER nested fields + * @param dropValueFillers If true the parser will drop all value FILLER fields + * @param nonTerminals A list of non-terminals (GROUPS) to combine and parse as primitive fields + * @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw). + * @param debugIgnoreFileSize If true the fixed length file reader won't check file size divisibility. Useful for debugging binary file / copybook mismatches. + * @param metadataPolicy Specifies the policy of metadat fields to be added to the Spark schema */ case class CobolParameters( - copybookPath: Option[String], - multiCopybookPath: Seq[String], - copybookContent: Option[String], - sourcePaths: Seq[String], - recordFormat: RecordFormat, - isText: Boolean, - isEbcdic: Boolean, - ebcdicCodePage: String, - ebcdicCodePageClass: Option[String], - asciiCharset: Option[String], - fieldCodePage: Map[String, String], - isUtf16BigEndian: Boolean, - floatingPointFormat: FloatingPointFormat, - recordStartOffset: Int, - recordEndOffset: Int, - recordLength: Option[Int], - minimumRecordLength: Option[Int], - maximumRecordLength: Option[Int], - variableLengthParams: Option[VariableLengthParameters], - variableSizeOccurs: Boolean, - generateRecordBytes: Boolean, - schemaRetentionPolicy: SchemaRetentionPolicy, - stringTrimmingPolicy: StringTrimmingPolicy, - allowPartialRecords: Boolean, - multisegmentParams: Option[MultisegmentParameters], - commentPolicy: CommentPolicy, - strictSignOverpunch: Boolean, - improvedNullDetection: Boolean, - decodeBinaryAsHex: Boolean, - dropGroupFillers: Boolean, - dropValueFillers: Boolean, - fillerNamingPolicy: FillerNamingPolicy, - nonTerminals: Seq[String], - occursMappings: Map[String, Map[String, Int]], - debugFieldsPolicy: DebugFieldsPolicy, - debugIgnoreFileSize: Boolean, - metadataPolicy: MetadataPolicy + copybookPath: Option[String], + multiCopybookPath: Seq[String], + copybookContent: Option[String], + sourcePaths: Seq[String], + recordFormat: RecordFormat, + isText: Boolean, + isEbcdic: Boolean, + ebcdicCodePage: String, + ebcdicCodePageClass: Option[String], + asciiCharset: Option[String], + fieldCodePage: Map[String, String], + isUtf16BigEndian: Boolean, + floatingPointFormat: FloatingPointFormat, + recordStartOffset: Int, + recordEndOffset: Int, + recordLength: Option[Int], + minimumRecordLength: Option[Int], + maximumRecordLength: Option[Int], + variableLengthParams: Option[VariableLengthParameters], + variableSizeOccurs: Boolean, + generateRecordBytes: Boolean, + schemaRetentionPolicy: SchemaRetentionPolicy, + stringTrimmingPolicy: StringTrimmingPolicy, + 
allowPartialRecords: Boolean, + multisegmentParams: Option[MultisegmentParameters], + commentPolicy: CommentPolicy, + strictSignOverpunch: Boolean, + improvedNullDetection: Boolean, + strictIntegralPrecision: Boolean, + decodeBinaryAsHex: Boolean, + dropGroupFillers: Boolean, + dropValueFillers: Boolean, + fillerNamingPolicy: FillerNamingPolicy, + nonTerminals: Seq[String], + occursMappings: Map[String, Map[String, Int]], + debugFieldsPolicy: DebugFieldsPolicy, + debugIgnoreFileSize: Boolean, + metadataPolicy: MetadataPolicy ) diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala index b002d1ed..36c546ff 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala @@ -63,6 +63,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten * @param multisegment Parameters specific to reading multisegment files * @param commentPolicy A comment truncation policy * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null. + * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook. * @param decodeBinaryAsHex Decode binary fields as HEX strings * @param dropGroupFillers If true the parser will drop all FILLER fields, even GROUP FILLERS that have non-FILLER nested fields * @param dropValueFillers If true the parser will drop all value FILLER fields @@ -112,6 +113,7 @@ case class ReaderParameters( commentPolicy: CommentPolicy = CommentPolicy(), strictSignOverpunch: Boolean = true, improvedNullDetection: Boolean = false, + strictIntegralPrecision: Boolean = false, decodeBinaryAsHex: Boolean = false, dropGroupFillers: Boolean = false, dropValueFillers: Boolean = true, diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala index 4411b91f..f45f67a7 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala @@ -36,6 +36,7 @@ import scala.collection.immutable.HashMap * * @param copybook A parsed copybook. * @param policy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook. + * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook. * @param generateRecordId If true, a record id field will be prepended to the beginning of the schema. * @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema. * @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema. 
@@ -45,6 +46,7 @@ import scala.collection.immutable.HashMap */ class CobolSchema(val copybook: Copybook, val policy: SchemaRetentionPolicy, + val strictIntegralPrecision: Boolean, val inputFileNameField: String, val generateRecordId: Boolean, val generateRecordBytes: Boolean, @@ -94,6 +96,7 @@ object CobolSchema { readerParameters.commentPolicy, readerParameters.strictSignOverpunch, readerParameters.improvedNullDetection, + readerParameters.strictIntegralPrecision, readerParameters.decodeBinaryAsHex, codePage, asciiCharset, @@ -116,6 +119,7 @@ object CobolSchema { readerParameters.commentPolicy, readerParameters.strictSignOverpunch, readerParameters.improvedNullDetection, + readerParameters.strictIntegralPrecision, readerParameters.decodeBinaryAsHex, codePage, asciiCharset, @@ -128,7 +132,7 @@ object CobolSchema { )) val segIdFieldCount = readerParameters.multisegment.map(p => p.segmentLevelIds.size).getOrElse(0) val segmentIdPrefix = readerParameters.multisegment.map(p => p.segmentIdPrefix).getOrElse("") - new CobolSchema(schema, readerParameters.schemaPolicy, readerParameters.inputFileNameColumn, readerParameters.generateRecordId, readerParameters.generateRecordBytes, segIdFieldCount, segmentIdPrefix, readerParameters.metadataPolicy) + new CobolSchema(schema, readerParameters.schemaPolicy, readerParameters.strictIntegralPrecision, readerParameters.inputFileNameColumn, readerParameters.generateRecordId, readerParameters.generateRecordBytes, segIdFieldCount, segmentIdPrefix, readerParameters.metadataPolicy) } def getCodePage(codePageName: String, codePageClass: Option[String]): CodePage = { diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala index 083ca342..8c6cd3a9 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala @@ -40,6 +40,7 @@ class DataSizeSpec extends AnyFunSuite { floatingPointFormat = FloatingPointFormat.IBM, strictSignOverpunch = true, improvedNullDetection = false, + strictIntegralPrecision = false, decodeBinaryAsHex = false, fieldCodePageMap = Map.empty) diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala index 6220c5eb..020b00ff 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala @@ -40,6 +40,7 @@ class PicValidationSpec extends AnyFunSuite { FloatingPointFormat.IBM, strictSignOverpunch = true, improvedNullDetection = false, + strictIntegralPrecision = false, decodeBinaryAsHex = false, fieldCodePageMap = Map.empty) diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala index a94e4b4f..6fc78d8d 100644 --- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala +++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala @@ -71,6 +71,7 @@ object CobolParametersParser extends Logging { val PARAM_GROUP_FILLERS = "drop_group_fillers" val PARAM_VALUE_FILLERS = "drop_value_fillers" val 
PARAM_FILLER_NAMING_POLICY = "filler_naming_policy" + val PARAM_STRICT_INTEGRAL_PRECISION = "strict_integral_precision" val PARAM_GROUP_NOT_TERMINALS = "non_terminals" val PARAM_OCCURS_MAPPINGS = "occurs_mappings" @@ -272,6 +273,7 @@ object CobolParametersParser extends Logging { parseCommentTruncationPolicy(params), params.getOrElse(PARAM_STRICT_SIGN_OVERPUNCHING, "true").toBoolean, params.getOrElse(PARAM_IMPROVED_NULL_DETECTION, "true").toBoolean, + params.getOrElse(PARAM_STRICT_INTEGRAL_PRECISION, "false").toBoolean, params.getOrElse(PARAM_BINARY_AS_HEX, "false").toBoolean, params.getOrElse(PARAM_GROUP_FILLERS, "false").toBoolean, params.getOrElse(PARAM_VALUE_FILLERS, "true").toBoolean, @@ -408,6 +410,7 @@ object CobolParametersParser extends Logging { parameters.commentPolicy, parameters.strictSignOverpunch, parameters.improvedNullDetection, + parameters.strictIntegralPrecision, parameters.decodeBinaryAsHex, parameters.dropGroupFillers, parameters.dropValueFillers, diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala index 470a5139..490c185b 100644 --- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala +++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala @@ -39,7 +39,8 @@ import scala.collection.mutable.ArrayBuffer * provides the corresponding Spark schema and also other properties for the Spark data source. * * @param copybook A parsed copybook. - * @param policy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook. + * @param schemaRetentionPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook. + * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook. * @param generateRecordId If true, a record id field will be prepended to the beginning of the schema. * @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema. * @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema. @@ -48,15 +49,16 @@ import scala.collection.mutable.ArrayBuffer * @param metadataPolicy Specifies a policy to generate metadata fields. 
*/ class CobolSchema(copybook: Copybook, - policy: SchemaRetentionPolicy, - inputFileNameField: String, - generateRecordId: Boolean, + schemaRetentionPolicy: SchemaRetentionPolicy, + strictIntegralPrecision: Boolean = false, + inputFileNameField: String = "", + generateRecordId: Boolean = false, generateRecordBytes: Boolean = false, generateSegIdFieldsCnt: Int = 0, segmentIdProvidedPrefix: String = "", metadataPolicy: MetadataPolicy = MetadataPolicy.Basic) extends CobolReaderSchema( - copybook, policy, inputFileNameField, generateRecordId, generateRecordBytes, + copybook, schemaRetentionPolicy, strictIntegralPrecision, inputFileNameField, generateRecordId, generateRecordBytes, generateSegIdFieldsCnt, segmentIdProvidedPrefix ) with Logging with Serializable { @@ -90,7 +92,7 @@ class CobolSchema(copybook: Copybook, val redefines = copybook.getAllSegmentRedefines parseGroup(group, redefines) } - val expandRecords = if (policy == SchemaRetentionPolicy.CollapseRoot || copybook.isFlatCopybook) { + val expandRecords = if (schemaRetentionPolicy == SchemaRetentionPolicy.CollapseRoot || copybook.isFlatCopybook) { // Expand root group fields records.toArray.flatMap(group => group.dataType.asInstanceOf[StructType].fields) } else { @@ -181,7 +183,9 @@ class CobolSchema(copybook: Copybook, case Some(RAW) => BinaryType case _ => StringType } - case dt: Integral => + case dt: Integral if strictIntegralPrecision => + DecimalType(precision = dt.precision, scale = 0) + case dt: Integral => if (dt.precision > Constants.maxLongPrecision) { DecimalType(precision = dt.precision, scale = 0) } else if (dt.precision > Constants.maxIntegerPrecision) { @@ -326,6 +330,7 @@ object CobolSchema { new CobolSchema( schema.copybook, schema.policy, + schema.strictIntegralPrecision, schema.inputFileNameField, schema.generateRecordId, schema.generateRecordBytes, diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala index be750523..79822d20 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala @@ -102,6 +102,6 @@ class CobolSchemaHierarchicalSpec extends AnyWordSpec { private def parseSchema(copybook: String, segmentRedefines: List[String], fieldParentMap: Map[String, String]): CobolSchema = { val parsedSchema = CopybookParser.parseTree(copybook, segmentRedefines = segmentRedefines, fieldParentMap = fieldParentMap) - new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "",false, false) + new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "",false, false) } } diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala index 9d275e8c..f047614a 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala @@ -16,7 +16,7 @@ package za.co.absa.cobrix.spark.cobol -import org.apache.spark.sql.types.{ArrayType, IntegerType, LongType, StringType, StructType} +import org.apache.spark.sql.types.{ArrayType, DecimalType, IntegerType, LongType, StringType, StructType} import org.scalatest.wordspec.AnyWordSpec import org.slf4j.{Logger, LoggerFactory} import 
za.co.absa.cobrix.cobol.parser.CopybookParser @@ -54,7 +54,25 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false) + val actualSchema = cobolSchema.getSparkSchema.treeString + + assertEqualsMultiline(actualSchema, expectedSchema) + } + + "Derive integral strict precision Spark schema from a Copybook" in { + val expectedSchema = + """root + | |-- BIN_INT: decimal(4,0) (nullable = true) + | |-- STRUCT_FLD: struct (nullable = true) + | | |-- STR_FLD: string (nullable = true) + | |-- DATA_STRUCT: struct (nullable = true) + | | |-- EXAMPLE_INT_FLD: decimal(7,0) (nullable = true) + | | |-- EXAMPLE_STR_FLD: string (nullable = true) + |""".stripMargin.replaceAll("[\\r\\n]", "\n") + + val parsedSchema = CopybookParser.parseTree(copyBookContents) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, true, "", false, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -75,7 +93,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -94,7 +112,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, true) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, true) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -116,7 +134,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, true) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, true) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -144,7 +162,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", true, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -160,7 +178,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = 
CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", false, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -177,7 +195,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -192,7 +210,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, true) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, true) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -210,7 +228,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, true) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, true) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -224,7 +242,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -253,7 +271,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", true, false, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, false, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -271,7 +289,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", false, true, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, true, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -292,7 +310,28 @@ class 
CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
           | |    |-- STR_FLD: string (nullable = true)
           |""".stripMargin.replaceAll("[\\r\\n]", "\n")
       val parsedSchema = CopybookParser.parseTree(copyBook)
-      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", true, true, 2)
+      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, true, 2)
+      val actualSchema = cobolSchema.getSparkSchema.treeString
+
+      assertEqualsMultiline(actualSchema, expectedSchema)
+    }
+
+    "multi-segment keep-original with record id and bytes generation and strict integral precision" in {
+      val expectedSchema =
+        """root
+          | |-- File_Id: integer (nullable = false)
+          | |-- Record_Id: long (nullable = false)
+          | |-- Record_Byte_Length: integer (nullable = false)
+          | |-- Record_Bytes: binary (nullable = false)
+          | |-- Seg_Id0: string (nullable = true)
+          | |-- Seg_Id1: string (nullable = true)
+          | |-- STRUCT1: struct (nullable = true)
+          | |    |-- IntValue: decimal(6,0) (nullable = true)
+          | |-- STRUCT2: struct (nullable = true)
+          | |    |-- STR_FLD: string (nullable = true)
+          |""".stripMargin.replaceAll("[\\r\\n]", "\n")
+      val parsedSchema = CopybookParser.parseTree(copyBook)
+      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, true, "", true, true, 2)
       val actualSchema = cobolSchema.getSparkSchema.treeString
 
       assertEqualsMultiline(actualSchema, expectedSchema)
@@ -309,7 +348,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
           | |    |-- STR_FLD: string (nullable = true)
           |""".stripMargin.replaceAll("[\\r\\n]", "\n")
       val parsedSchema = CopybookParser.parseTree(copyBook)
-      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", false, false, 2)
+      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false, 2)
       val actualSchema = cobolSchema.getSparkSchema.treeString
 
       assertEqualsMultiline(actualSchema, expectedSchema)
@@ -327,7 +366,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
           | |-- STR_FLD: string (nullable = true)
           |""".stripMargin.replaceAll("[\\r\\n]", "\n")
       val parsedSchema = CopybookParser.parseTree(copyBook)
-      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", true, false, 2)
+      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false, 2)
       val actualSchema = cobolSchema.getSparkSchema.treeString
 
       assertEqualsMultiline(actualSchema, expectedSchema)
@@ -342,7 +381,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
           | |-- STR_FLD: string (nullable = true)
           |""".stripMargin.replaceAll("[\\r\\n]", "\n")
       val parsedSchema = CopybookParser.parseTree(copyBook)
-      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, false, 2)
+      val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false, 2)
       val actualSchema = cobolSchema.getSparkSchema.treeString
 
       assertEqualsMultiline(actualSchema, expectedSchema)
@@ -361,7 +400,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
 
     val parsedSchema = CopybookParser.parseTree(copyBook)
 
-    val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, "", false, false)
+    val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false)
 
     val actualSparkSchema = cobolSchema1.getSparkSchema
     val rootField = actualSparkSchema.fields.head.dataType.asInstanceOf[StructType]
@@ -390,7 +429,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
 
     val parsedSchema = CopybookParser.parseTree(copyBook)
 
-    val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "", false, false)
+    val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false)
 
     val actualSparkSchema = cobolSchema1.getSparkSchema
     val metadataStr1 = actualSparkSchema.fields.head.metadata
@@ -429,6 +468,27 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
       assert(sparkSchema.fields(2).dataType == IntegerType)
     }
 
+    "return a schema for a copybook with strict decimal" in {
+      val copybook: String =
+        """         01  RECORD.
+          |           05  STR1      PIC X(10).
+          |           05  NUM2      PIC S9(12).
+          |           05  NUM3      PIC 9(7).
+          |""".stripMargin
+
+      val cobolSchema = CobolSchema.fromSparkOptions(Seq(copybook), Map("strict_integral_precision" -> "true"))
+
+      val sparkSchema = cobolSchema.getSparkSchema
+
+      assert(sparkSchema.fields.length == 3)
+      assert(sparkSchema.fields.head.name == "STR1")
+      assert(sparkSchema.fields.head.dataType == StringType)
+      assert(sparkSchema.fields(1).name == "NUM2")
+      assert(sparkSchema.fields(1).dataType == DecimalType(12, 0))
+      assert(sparkSchema.fields(2).name == "NUM3")
+      assert(sparkSchema.fields(2).dataType == DecimalType(7, 0))
+    }
+
     "return a schema for multiple copybooks" in {
       val copybook1: String =
         """         01  RECORD1.
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
index 269c2ab8..a4604bf9 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
@@ -37,7 +37,7 @@ class SparkSchemaSpec extends AnyFunSuite {
 
     val parsedSchema = CopybookParser.parseTree(copyBookContents)
 
-    val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, "",false, false)
+    val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "",false, false)
 
     val sparkSchema = cobolSchema.getSparkSchema
 
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
index 33a8fc3b..2a9a7cc8 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
@@ -24,7 +24,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
 
 import scala.collection.Seq
 
-class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(new Copybook(Group.root), SchemaRetentionPolicy.KeepOriginal, "", false, false) with Serializable {
+class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(new Copybook(Group.root), SchemaRetentionPolicy.KeepOriginal, false, "", false, false) with Serializable {
   override def getSparkSchema: StructType = sparkSchema
 
   override lazy val getRecordSize: Int = 40
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test38StrictIntegralPrecisionSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test38StrictIntegralPrecisionSpec.scala
new file mode 100644
index 00000000..fff54f62
--- /dev/null
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test38StrictIntegralPrecisionSpec.scala
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.source.integration
+
+import org.scalatest.wordspec.AnyWordSpec
+import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
+import za.co.absa.cobrix.spark.cobol.source.fixtures.{BinaryFileFixture, TextComparisonFixture}
+
+class Test38StrictIntegralPrecisionSpec extends AnyWordSpec with SparkTestBase with BinaryFileFixture with TextComparisonFixture {
+  private val copybook =
+    """      01  R.
+                03 SEG-ID        PIC X(1).
+                03 SEG1.
+                  05 NUM1        PIC 9(2).
+                03 SEG2 REDEFINES SEG1.
+                  05 NUM2        PIC S9(9).
+                03 SEG3 REDEFINES SEG1.
+                  05 NUM3        PIC S9(15).
+    """
+
+  val dataSimple: Array[Byte] = Array(
+    0xC1, 0xF1, 0xF2, 0xF3,                                      // record 0 'A12'
+    0xC2, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9,  // record 1 'B123456789'
+    0xC3, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9,
+    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5).map(_.toByte)            // record 2 'C123456789012345'
+
+  "non-strict decimals" should {
+    "be decoded normally" in {
+      withTempBinFile("strict_integral_precision", ".tmp", dataSimple) { tempFile =>
+        val expectedSchema =
+          """root
+            | |-- SEG_ID: string (nullable = true)
+            | |-- SEG1: struct (nullable = true)
+            | |    |-- NUM1: integer (nullable = true)
+            | |-- SEG2: struct (nullable = true)
+            | |    |-- NUM2: integer (nullable = true)
+            | |-- SEG3: struct (nullable = true)
+            | |    |-- NUM3: long (nullable = true)""".stripMargin
+
+        val expectedData = """{"SEG_ID":"A","SEG1":{"NUM1":12}},{"SEG_ID":"B","SEG2":{"NUM2":123456789}},{"SEG_ID":"C","SEG3":{"NUM3":123456789012345}}"""
+
+        val df = spark.read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("record_format", "F")
+          .option("record_length_field", "SEG-ID")
+          .option("segment_field", "SEG-ID")
+          .option("input_split_records", "2")
+          .option("pedantic", "true")
+          .option("record_length_map", """{"A":4,"B":10,"C":16}""")
+          .option("redefine_segment_id_map:0", "SEG1 => A")
+          .option("redefine_segment_id_map:1", "SEG2 => B")
+          .option("redefine_segment_id_map:2", "SEG3 => C")
+          .load(tempFile)
+
+        val actualSchema = df.schema.treeString
+        val actualData = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+        compareText(actualSchema, expectedSchema)
+        assert(actualData == expectedData)
+      }
+    }
+  }
+
+  "strict decimals" should {
+    "be decoded as decimals for normal files" in {
+      withTempBinFile("strict_integral_precision", ".tmp", dataSimple) { tempFile =>
+        val expectedSchema =
+          """root
+            | |-- SEG_ID: string (nullable = true)
+            | |-- SEG1: struct (nullable = true)
+            | |    |-- NUM1: decimal(2,0) (nullable = true)
+            | |-- SEG2: struct (nullable = true)
+            | |    |-- NUM2: decimal(9,0) (nullable = true)
+            | |-- SEG3: struct (nullable = true)
+            | |    |-- NUM3: decimal(15,0) (nullable = true)""".stripMargin
+
+        val expectedData = """{"SEG_ID":"A","SEG1":{"NUM1":12}},{"SEG_ID":"B","SEG2":{"NUM2":123456789}},{"SEG_ID":"C","SEG3":{"NUM3":123456789012345}}"""
+
+        val df = spark.read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("record_format", "F")
+          .option("record_length_field", "SEG-ID")
+          .option("segment_field", "SEG-ID")
+          .option("input_split_records", "2")
+          .option("pedantic", "true")
+          .option("record_length_map", """{"A":4,"B":10,"C":16}""")
+          .option("redefine_segment_id_map:0", "SEG1 => A")
+          .option("redefine_segment_id_map:1", "SEG2 => B")
+          .option("redefine_segment_id_map:2", "SEG3 => C")
+          .option("strict_integral_precision", "true")
+          .load(tempFile)
+
+        val actualSchema = df.schema.treeString
+        val actualData = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+        compareText(actualSchema, expectedSchema)
+        assert(actualData == expectedData)
+      }
+    }
+
+    "be decoded as decimals for hierarchical files" in {
+      withTempBinFile("strict_integral_precision", ".tmp", dataSimple) { tempFile =>
+        val expectedSchema =
+          """root
+            | |-- SEG_ID: string (nullable = true)
+            | |-- SEG1: struct (nullable = true)
+            | |    |-- NUM1: decimal(2,0) (nullable = true)
+            | |    |-- SEG2: array (nullable = true)
+            | |    |    |-- element: struct (containsNull = true)
+            | |    |    |    |-- NUM2: decimal(9,0) (nullable = true)
+            | |    |-- SEG3: array (nullable = true)
+            | |    |    |-- element: struct (containsNull = true)
+            | |    |    |    |-- NUM3: decimal(15,0) (nullable = true)""".stripMargin
+
+        val expectedData = """{"SEG_ID":"A","SEG1":{"NUM1":12,"SEG2":[{"NUM2":123456789}],"SEG3":[{"NUM3":123456789012345}]}}"""
+
+        val df = spark.read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("record_format", "F")
+          .option("record_length_field", "SEG-ID")
+          .option("segment_field", "SEG-ID")
+          .option("input_split_records", "2")
+          .option("pedantic", "true")
+          .option("record_length_map", """{"A":4,"B":10,"C":16}""")
+          .option("redefine_segment_id_map:0", "SEG1 => A")
+          .option("redefine_segment_id_map:1", "SEG2 => B")
+          .option("redefine_segment_id_map:2", "SEG3 => C")
+          .option("segment-children:1", "SEG1 => SEG2,SEG3")
+
+          .option("strict_integral_precision", "true")
+          .load(tempFile)
+
+        val actualSchema = df.schema.treeString
+        val actualData = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+        compareText(actualSchema, expectedSchema)
+        assert(actualData == expectedData)
+      }
+    }
+  }
+}
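As a quick illustration of how an end user would enable the option exercised by the tests above, here is a minimal sketch (illustrative only: the inline copybook, the load path, and the printed schema are made up, and an existing `spark` session is assumed):

```scala
// Sketch: with strict_integral_precision enabled, integral pictures such as
// 9(7) or S9(12) surface as decimal(7,0) / decimal(12,0) instead of int/long.
// The copybook and the input path below are hypothetical.
val copybook =
  """      01  RECORD.
    |        05  ID      PIC 9(7).
    |        05  AMOUNT  PIC S9(12).
    |""".stripMargin

val df = spark.read
  .format("cobol")
  .option("copybook_contents", copybook)
  .option("record_format", "F")
  .option("strict_integral_precision", "true")
  .load("/path/to/mainframe/data")

df.printSchema()
// Expected shape (per the option's contract):
// root
//  |-- ID: decimal(7,0) (nullable = true)
//  |-- AMOUNT: decimal(12,0) (nullable = true)
```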
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
index b8bdb3fb..7c7712ae 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
@@ -52,7 +52,7 @@ class Test02TextFilesOldSchool extends AnyFunSuite with SparkTestBase with Binar
 
     withTempTextFile("text_ascii", ".txt", StandardCharsets.UTF_8, textFileContent) { tmpFileName =>
       val parsedCopybook = CopybookParser.parse(copybook, dataEncoding = ASCII, stringTrimmingPolicy = StringTrimmingPolicy.TrimNone)
-      val cobolSchema = new CobolSchema(parsedCopybook, SchemaRetentionPolicy.CollapseRoot, "", false)
+      val cobolSchema = new CobolSchema(parsedCopybook, SchemaRetentionPolicy.CollapseRoot, false, "", false)
       val sparkSchema = cobolSchema.getSparkSchema
 
       val rddText = spark.sparkContext