#678 Implement strict integral precision feature.
yruslan committed Jul 12, 2024
1 parent 6d1c729 commit 8b5e298
Showing 19 changed files with 419 additions and 161 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -1517,6 +1517,7 @@ The output looks like this:
| .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. |
| .option("strict_sign_overpunching", "true") | If `true` (default), sign overpunching will only be allowed for signed numbers. If `false`, overpunched positive sign will be allowed for unsigned numbers, but negative sign will result in null. |
| .option("improved_null_detection", "true") | If `true`(default), values that contain only 0x0 ror DISPLAY strings and numbers will be considered `null`s instead of empty strings. |
| .option("strict_integral_precision", "true") | If `true`, Cobrix will not generate `short`/`integer`/`long` Spark data types, and always use `decimal(n)` with the exact precision that matches the copybook. |
| .option("binary_as_hex", "false") | By default fields that have `PIC X` and `USAGE COMP` are converted to `binary` Spark data type. If this option is set to `true`, such fields will be strings in HEX encoding. |

##### Modifier options

@@ -87,12 +87,14 @@ object CopybookParser extends Logging {
dropGroupFillers: Boolean = false,
dropValueFillers: Boolean = true,
commentPolicy: CommentPolicy = CommentPolicy(),
dropFillersFromAst: Boolean = false
dropFillersFromAst: Boolean = false,
strictIntegralPrecision: Boolean = false
): Copybook = {
val copybook = parse(copyBookContents = copyBookContents,
dropGroupFillers = dropGroupFillers,
dropValueFillers = dropValueFillers,
commentPolicy = commentPolicy)
commentPolicy = commentPolicy,
strictIntegralPrecision = strictIntegralPrecision)

if (dropFillersFromAst && (dropGroupFillers || dropValueFillers)) {
copybook.dropFillers(dropGroupFillers, dropValueFillers)
@@ -135,6 +137,7 @@ object CopybookParser extends Logging {
commentPolicy: CommentPolicy = CommentPolicy(),
strictSignOverpunch: Boolean = true,
improvedNullDetection: Boolean = false,
strictIntegralPrecision: Boolean = false,
decodeBinaryAsHex: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
@@ -155,6 +158,7 @@ object CopybookParser extends Logging {
commentPolicy,
strictSignOverpunch,
improvedNullDetection,
strictIntegralPrecision,
decodeBinaryAsHex,
ebcdicCodePage,
asciiCharset,
@@ -197,6 +201,7 @@ object CopybookParser extends Logging {
commentPolicy: CommentPolicy = CommentPolicy(),
strictSignOverpunch: Boolean = true,
improvedNullDetection: Boolean = false,
strictIntegralPrecision: Boolean = false,
decodeBinaryAsHex: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
@@ -217,6 +222,7 @@ object CopybookParser extends Logging {
commentPolicy,
strictSignOverpunch,
improvedNullDetection,
strictIntegralPrecision,
decodeBinaryAsHex,
ebcdicCodePage,
asciiCharset,
@@ -262,6 +268,7 @@ object CopybookParser extends Logging {
commentPolicy: CommentPolicy,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean,
strictIntegralPrecision: Boolean,
decodeBinaryAsHex: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
@@ -272,7 +279,7 @@ object CopybookParser extends Logging {
debugFieldsPolicy: DebugFieldsPolicy,
fieldCodePageMap: Map[String, String]): Copybook = {

val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)
val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)

val nonTerms: Set[String] = (for (id <- nonTerminals)
yield transformIdentifier(id)
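
To illustrate how the new parameter surfaces in the parser API, here is a hedged calling sketch. Only the parameter list is visible in the hunks above, so the method name `parseSimple` and the toy copybook are assumptions.

```scala
import za.co.absa.cobrix.cobol.parser.CopybookParser

// Hedged sketch: parse a copybook with strict integral precision enabled.
// `parseSimple` is an assumed entry point; its visible parameters are
// dropGroupFillers, dropValueFillers, commentPolicy, dropFillersFromAst and
// the strictIntegralPrecision flag added by this commit.
val copybookText =
  """       01  RECORD.
    |          05  AMOUNT    PIC 9(10).
    |""".stripMargin

val copybook = CopybookParser.parseSimple(
  copybookText,
  strictIntegralPrecision = true  // PIC 9(10) should now surface as decimal(10) rather than long
)
```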
@@ -57,13 +57,14 @@ object ANTLRParser extends Logging {
commentPolicy: CommentPolicy,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean,
strictIntegralPrecision: Boolean,
decodeBinaryAsHex: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat,
fieldCodePageMap: Map[String, String]): CopybookAST = {
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, decodeBinaryAsHex, fieldCodePageMap)
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap)

val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
line =>

@@ -47,6 +47,7 @@ class ParserVisitor(enc: Encoding,
floatingPointFormat: FloatingPointFormat,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean,
strictIntegralPrecision: Boolean,
decodeBinaryAsHex: Boolean,
fieldCodePageMap: Map[String, String]) extends copybookParserBaseVisitor[Expr] {
/* expressions */
@@ -853,7 +854,7 @@ class ParserVisitor(enc: Encoding,
Map(),
isDependee = false,
identifier.toUpperCase() == Constants.FILLER,
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
) (Some(parent))

parent.children.append(prim)

@@ -42,13 +42,15 @@ object DecoderSelector {
* <li> Integral types are represented as boxed integers and longs. Larger integral numbers are represented as BigDecimal </li>
* </ul>
*
* @param dataType A daatype of a copybook field
* @param stringTrimmingPolicy Specifies how the decoder should handle string types
* @param ebcdicCodePage Specifies a code page to use for EBCDIC to ASCII/Unicode conversion
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat Specifies a floating point format (IBM or IEEE754)
* @param strictSignOverpunch if true, sign overpunching is not allowed for positive numbers.
* @param dataType A datatype of a copybook field
* @param stringTrimmingPolicy Specifies how the decoder should handle string types
* @param ebcdicCodePage Specifies a code page to use for EBCDIC to ASCII/Unicode conversion
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat Specifies a floating point format (IBM or IEEE754)
* @param strictSignOverpunch if true, sign overpunching is not allowed for positive numbers.
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @return A function that converts an array of bytes to the target data type.
*/
def getDecoder(dataType: CobolType,
@@ -58,11 +60,12 @@
isUtf16BigEndian: Boolean = true,
floatingPointFormat: FloatingPointFormat = FloatingPointFormat.IBM,
strictSignOverpunch: Boolean = false,
improvedNullDetection: Boolean = false): Decoder = {
improvedNullDetection: Boolean = false,
strictIntegralPrecision: Boolean = false): Decoder = {
val decoder = dataType match {
case alphaNumeric: AlphaNumeric => getStringDecoder(alphaNumeric.enc.getOrElse(EBCDIC), stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, improvedNullDetection)
case decimalType: Decimal => getDecimalDecoder(decimalType, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection)
case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
case _ => throw new IllegalStateException("Unknown AST object")
}
decoder
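
For the DISPLAY integral path, the effect of the new flag can be sketched with the string decoders the selector routes to. The package path of `StringDecoders` is an assumption; the three boolean arguments mirror `!isSigned`, `isSigned || !strictSignOverpunch` and `improvedNullDetection` from the selector code above, and the result is expected (not verified here) to be a `BigDecimal`, matching the `decimal(n)` behaviour described in the README.

```scala
import za.co.absa.cobrix.cobol.parser.decoders.StringDecoders  // package path assumed

// EBCDIC digits '0'..'9' are encoded as 0xF0..0xF9, so these bytes spell "1234".
val ebcdicBytes = Array(0xF1, 0xF2, 0xF3, 0xF4).map(_.toByte)

// With strictIntegralPrecision = true the selector always picks the "big number"
// decoder for DISPLAY integrals, regardless of precision. The booleans below are,
// in order: !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection.
val value = StringDecoders.decodeEbcdicBigNumber(ebcdicBytes, true, true, false)
// Expected (hedged): a BigDecimal-valued 1234 rather than a boxed Integer.
```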
@@ -186,7 +189,7 @@
/** Gets a decoder function for an integral data type. A direct conversion from array of bytes to the target type is used where possible. */
private def getIntegralDecoder(integralType: Integral,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean): Decoder = {
improvedNullDetection: Boolean,
strictIntegralPrecision: Boolean): Decoder = {
val encoding = integralType.enc.getOrElse(EBCDIC)

val isEbcidic = encoding match {
@@ -198,7 +202,12 @@

integralType.compact match {
case None =>
if (integralType.precision <= Constants.maxIntegerPrecision) {
if (strictIntegralPrecision) {
if (isEbcidic)
StringDecoders.decodeEbcdicBigNumber(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
else
StringDecoders.decodeAsciiBigNumber(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
} else if (integralType.precision <= Constants.maxIntegerPrecision) {
if (isEbcidic)
StringDecoders.decodeEbcdicInt(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
else
@@ -223,57 +232,60 @@
throw new IllegalStateException("Unexpected error. COMP-2 (double) is incorrect for an integral number.")
case Some(COMP3()) =>
// COMP-3 aka BCD-encoded number
getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = true)
getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = true, strictIntegralPrecision)
case Some(COMP3U()) =>
// COMP-3U aka Unsigned BCD-encoded number aka Unsigned Packed
getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = false)
getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = false, strictIntegralPrecision)
case Some(COMP4()) =>
// COMP aka BINARY encoded number
getBinaryEncodedIntegralDecoder(Some(COMP4()), integralType.precision, integralType.signPosition, isBigEndian = true)
getBinaryEncodedIntegralDecoder(Some(COMP4()), integralType.precision, integralType.signPosition, isBigEndian = true, strictIntegralPrecision)
case Some(COMP5()) =>
// COMP aka BINARY encoded number
getBinaryEncodedIntegralDecoder(Some(COMP5()), integralType.precision, integralType.signPosition, isBigEndian = true)
getBinaryEncodedIntegralDecoder(Some(COMP5()), integralType.precision, integralType.signPosition, isBigEndian = true, strictIntegralPrecision)
case Some(COMP9()) =>
// COMP aka BINARY encoded number
getBinaryEncodedIntegralDecoder(Some(COMP9()), integralType.precision, integralType.signPosition, isBigEndian = false)
getBinaryEncodedIntegralDecoder(Some(COMP9()), integralType.precision, integralType.signPosition, isBigEndian = false, strictIntegralPrecision)
case _ =>
throw new IllegalStateException(s"Unknown number compression format (${integralType.compact.get}).")
}
}

/** Gets a decoder function for a binary encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */
private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean): Decoder = {
private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean, strictIntegralPrecision: Boolean): Decoder = {
val isSigned = signPosition.nonEmpty
val isSignLeft = signPosition.forall(sp => if (sp == za.co.absa.cobrix.cobol.parser.position.Left) true else false)

val numOfBytes = BinaryUtils.getBytesCount(compact, precision, isSigned, isExplicitDecimalPt = false, isSignSeparate = false)
val decoder = (isSigned, isBigEndian, numOfBytes) match {
case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _
case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _
case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _
case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _
case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _
case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _
case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _
case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _
case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _
case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _
case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _
case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _
case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _
case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _
case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _
case (false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _
case _ =>
(a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
val decoder = if (strictIntegralPrecision) {
(a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
} else {
(isSigned, isBigEndian, numOfBytes) match {
case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _
case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _
case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _
case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _
case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _
case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _
case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _
case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _
case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _
case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _
case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _
case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _
case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _
case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _
case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _
case (false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _
case _ =>
(a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
}
}
decoder // 999 999 999
}

/** Gets a decoder function for a BCD-encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */
private def getBCDIntegralDecoder(precision: Int, mandatorySignNibble: Boolean): Decoder = {
private def getBCDIntegralDecoder(precision: Int, mandatorySignNibble: Boolean, strictIntegralPrecision: Boolean): Decoder = {
val decoder =
if (precision <= Constants.maxIntegerPrecision) {
if (precision <= Constants.maxIntegerPrecision && !strictIntegralPrecision) {
a: Array[Byte] => {
val num = BCDNumberDecoders.decodeBCDIntegralNumber(a, mandatorySignNibble)
if (num != null) {
Expand All @@ -282,7 +294,7 @@ object DecoderSelector {
null
}
}
} else if (precision <= Constants.maxLongPrecision) {
} else if (precision <= Constants.maxLongPrecision && !strictIntegralPrecision) {
a: Array[Byte] => BCDNumberDecoders.decodeBCDIntegralNumber(a, mandatorySignNibble)
} else {
a: Array[Byte] =>
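
The binary (COMP) path changes in the same way: without the flag, a field that fits into 1, 2, 4 or 8 bytes is decoded directly to a primitive, while with the flag every field falls through to the arbitrary-precision decoder. A hedged sketch follows; the package path of `BinaryNumberDecoders` is an assumption, and the decoder signatures are taken from the selector code above.

```scala
import za.co.absa.cobrix.cobol.parser.decoders.BinaryNumberDecoders  // package path assumed

// 0x04 0xD2 is 1234 as a two-byte big-endian signed binary (COMP) value.
val compBytes = Array(0x04, 0xD2).map(_.toByte)

// Non-strict path: a two-byte signed big-endian field is decoded straight to a short integer.
val asPrimitive = BinaryNumberDecoders.decodeBinarySignedShortBigEndian(compBytes)

// Strict path: the selector always uses the arbitrary-precision decoder
// (arguments: bytes, isBigEndian, isSigned), so the same bytes come back
// as an arbitrary-precision decimal value.
val asDecimal = BinaryNumberDecoders.decodeBinaryAribtraryPrecision(compBytes, true, true)
```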
(The remaining changed files in this commit are not rendered on this page.)
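
Putting the pieces together, the end-to-end effect on the Spark schema can be sketched as follows. This is a hedged illustration based on the README description above: `copybook_contents` (an existing Cobrix option for passing a copybook inline), the placeholder data path and a `spark` session in scope are assumptions, and the expected schema reflects the documented behaviour rather than an executed run.

```scala
// Hedged end-to-end sketch of the schema effect of strict_integral_precision.
val copybook =
  """       01  TRANSACTION.
    |          05  COUNT-FLD     PIC 9(4).
    |          05  AMOUNT-FLD    PIC S9(9)  COMP-3.
    |          05  ID-FLD        PIC S9(18) COMP.
    |""".stripMargin

val df = spark.read
  .format("cobol")
  .option("copybook_contents", copybook)        // inline copybook (existing option)
  .option("strict_integral_precision", "true")  // option added in this commit
  .load("/path/to/data")                        // placeholder path

df.printSchema()
// Expected with the option enabled (hyphens are normally replaced by underscores):
//   COUNT_FLD:  decimal(4,0)
//   AMOUNT_FLD: decimal(9,0)
//   ID_FLD:     decimal(18,0)
// Without the option these fields would map to short/integer/long Spark types,
// depending on precision.
```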
