Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Json configuration flag to allow comments #2592

Merged
merged 8 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright 2017-2024 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license.
*/

package kotlinx.benchmarks.json

import kotlinx.benchmarks.model.*
import kotlinx.serialization.json.*
import org.openjdk.jmh.annotations.*
import java.io.*
import java.util.concurrent.*

@Warmup(iterations = 7, time = 1)
@Measurement(iterations = 7, time = 1)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Benchmark)
@Fork(2)
open class TwitterFeedCommentsBenchmark {
val inputBytes = TwitterFeedBenchmark::class.java.getResource("/twitter_macro.json").readBytes()
private val input = inputBytes.decodeToString()
private val inputWithComments = prepareInputWithComments(input)
private val inputWithCommentsBytes = inputWithComments.encodeToByteArray()

private val jsonComments = Json { ignoreUnknownKeys = true; allowComments = true; }
private val jsonNoComments = Json { ignoreUnknownKeys = true; allowComments = false; }

fun prepareInputWithComments(inp: String): String {
val result = inp.lineSequence().map { s ->
// "id", "in_...", "is_...", etc
if (!s.trimStart().startsWith("\"i")) s else "$s // json comment"
}.joinToString("\n")
assert(result.contains("// json comment"))
return result
}

@Setup
fun init() {
// Explicitly invoking both variants before benchmarking so we know that both parser implementation classes are loaded
require("foobar" == jsonComments.decodeFromString<String>("\"foobar\""))
require("foobar" == jsonNoComments.decodeFromString<String>("\"foobar\""))
}

// The difference with TwitterFeedBenchmark.decodeMicroTwitter shows if we slow down when both StringJsonLexer and CommentsJsonLexer
// are loaded by JVM. Should be almost non-existent on modern JVMs (but on OpenJDK-Corretto-11.0.14.1 there is one. 17 is fine.)
@Benchmark
fun decodeMicroTwitter() = jsonNoComments.decodeFromString(MicroTwitterFeed.serializer(), input)

// The difference with this.decodeMicroTwitter shows if we slow down when comments are enabled but no comments present
// in the input. It is around 13% slower than without comments support, mainly because skipWhitespaces is a separate function
// that sometimes is not inlined by JIT.
@Benchmark
fun decodeMicroTwitterCommentSupport() = jsonComments.decodeFromString(MicroTwitterFeed.serializer(), input)

// Shows how much actual skipping of the comments takes: around 10%.
@Benchmark
fun decodeMicroTwitterCommentInData() = jsonComments.decodeFromString(MicroTwitterFeed.serializer(), inputWithComments)

@Benchmark
fun decodeMicroTwitterCommentSupportStream() = jsonComments.decodeFromStream(MicroTwitterFeed.serializer(), ByteArrayInputStream(inputBytes))

@Benchmark
fun decodeMicroTwitterCommentInDataStream() = jsonComments.decodeFromStream(MicroTwitterFeed.serializer(), ByteArrayInputStream(inputWithCommentsBytes))
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ open class TwitterFeedStreamBenchmark {
jacksonObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)


@Setup
fun init() {
// Explicitly invoking decodeFromStream before benchmarking so we know that both parser implementation classes are loaded
require("foobar" == Json.decodeFromStream<String>(ByteArrayInputStream("\"foobar\"".encodeToByteArray())))
require("foobar" == Json.decodeFromString<String>("\"foobar\""))
}


private val inputStream: InputStream
get() = ByteArrayInputStream(bytes)
private val outputStream: OutputStream
Expand Down Expand Up @@ -59,6 +67,7 @@ open class TwitterFeedStreamBenchmark {
}
}

// Difference with TwitterFeedBenchmark.decodeMicroTwitter shows how heavy Java's standard UTF-8 decoding actually is.
@Benchmark
fun decodeMicroTwitterReadText(): MicroTwitterFeed {
return inputStream.use {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*
* Copyright 2017-2024 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license.
*/

package kotlinx.serialization.features

import kotlinx.serialization.*
import kotlinx.serialization.json.*
import kotlin.test.*

class JsonCommentsTest: JsonTestBase() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need special tests for StringJsonLexer and ReaderJsonLexer. We now have two decoding branches that can be modified independently of each other.

Since JSON without comments is actually a subset of JSON with comments, it is necessary to test it for both cases.

We can do this by running all the JSON tests for allowComments = true, however, this will be very redundant.
It is easier to identify the main decoding cases and write a limited set of tests for them, which works the same regardless of the source (String or Reader) and the settings (with or without comments)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm afraid we do not have particular 'main decoding cases' tests. The closes place I can think of is Json spec tests:

Do you think it is enough to add allowComments = true there?

Also, 'JsonCommentsTest' may be already enough because it has arrays, nested objects, and strings, and there's really not much anything else in the json spec.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I think SpecConformanceTest will be enough.
After all, this flag only changes parsing, but not the serialization logic.

val json = Json(default) {
allowComments = true
allowTrailingComma = true
}

val withLenient = Json(json) {
isLenient = true
ignoreUnknownKeys = true
}

@Test
fun testBasic() = parametrizedTest { mode ->
val inputBlock = """{"data": "b" /*value b*/ }"""
val inputLine = "{\"data\": \"b\" // value b \n }"
assertEquals(StringData("b"), json.decodeFromString(inputBlock, mode))
assertEquals(StringData("b"), json.decodeFromString(inputLine, mode))
}

@Serializable
data class Target(val key: String, val key2: List<Int>, val key3: NestedTarget, val key4: String)

@Serializable
data class NestedTarget(val nestedKey: String)

private fun target(key4: String): Target = Target("value", listOf(1, 2), NestedTarget("foo"), key4)

@Test
fun testAllBlocks() = parametrizedTest { mode ->
val input = """{ /*beginning*/
/*before key*/ "key" /*after key*/ : /*after colon*/ "value" /*before comma*/,
"key2": [ /*array1*/ 1, /*array2*/ 2, /*end array*/],
"key3": { /*nested obj*/ "nestedKey": "foo"} /*after nested*/,
"key4": "/*comment inside quotes is a part of value*/",
/*before end*/
}"""
assertEquals(target("/*comment inside quotes is a part of value*/"), json.decodeFromString(input, mode))
}

@Test
fun testAllLines() = parametrizedTest { mode ->
val input = """{ //beginning
//before key
"key" // after key
: // after colon
"value" //before comma
,
"key2": [ //array1
1, //array2
2, //end array
],
"key3": { //nested obj
"nestedKey": "foo"
} , //after nested
"key4": "//comment inside quotes is a part of value",
//before end
}"""
assertEquals(target("//comment inside quotes is a part of value"), json.decodeFromString(input, mode))
}

@Test
fun testMixed() = parametrizedTest { mode ->
sandwwraith marked this conversation as resolved.
Show resolved Hide resolved
val input = """{ // begin
"key": "value", // after
"key2": /* array */ /*another comment */ [1, 2],
"key3": /* //this is a block comment */ { "nestedKey": // /*this is a line comment*/ "bar"
"foo" },
"key4": /* nesting block comments /* not supported */ "*/"
/* end */}"""
assertEquals(target("*/"), json.decodeFromString(input, mode))
}

@Test
fun testWeirdKeys() {
val map = mapOf(
"// comment inside quotes is a part of key" to "/* comment inside quotes is a part of value */",
"/*key */" to "/* value",
"/* key" to "*/ value"
)
val input = """/* before begin */
{
${map.entries.joinToString(separator = ",\n") { (k, v) -> "\"$k\" : \"$v\"" }}
} // after end
""".trimIndent()
val afterMap = json.parseToJsonElement(input).jsonObject.mapValues { (_, v) ->
v as JsonPrimitive
assertTrue(v.isString)
v.content
}
assertEquals(map, afterMap)
}

@Test
fun testWithLenient() = parametrizedTest { mode ->
val input = """{ //beginning
//before key
key // after key
: // after colon
value //before comma
,
key2: [ //array1
1, //array2
2, //end array
],
key3: { //nested obj
nestedKey: "foo"
} , //after nested
key4: value//comment_cannot_break_value_apart,
key5: //comment without quotes where new token expected is still a comment
value5,
//before end
}"""
assertEquals(target("value//comment_cannot_break_value_apart"), withLenient.decodeFromString(input, mode))
}

@Test
fun testUnclosedCommentsErrorMsg() = parametrizedTest { mode ->
val input = """{"data": "x"} // no newline"""
assertEquals(StringData("x"), json.decodeFromString<StringData>(input, mode))
val input2 = """{"data": "x"} /* no endblock"""
assertFailsWith<SerializationException>("Expected end of the block comment: \"*/\", but had EOF instead at path: \$") {
json.decodeFromString<StringData>(input2, mode)
}
}

private val lexerBatchSize = 16 * 1024

@Test
fun testVeryLargeComments() = parametrizedTest { mode ->
val strLen = lexerBatchSize * 2 + 42
val inputLine = """{"data": //a""" + "a".repeat(strLen) + "\n\"x\"}"
assertEquals(StringData("x"), json.decodeFromString<StringData>(inputLine, mode))
val inputBlock = """{"data": /*a""" + "a".repeat(strLen) + "*/\"x\"}"
assertEquals(StringData("x"), json.decodeFromString<StringData>(inputBlock, mode))
}

@Test
fun testCommentsOnThresholdEdge() = parametrizedTest { mode ->
val inputPrefix = """{"data": /*a"""
// Here, we test the situation when closing */ is divided in buffer:
// * fits in the initial buffer, but / is not.
// E.g. situation with batches looks like this: ['{', '"', 'd', ..., '*'], ['/', ...]
val bloatSize = lexerBatchSize - inputPrefix.length - 1
val inputLine = inputPrefix + "a".repeat(bloatSize) + "*/\"x\"}"
assertEquals(StringData("x"), json.decodeFromString<StringData>(inputLine, mode))

// Test when * is unclosed and last in buffer:
val inputLine2 = inputPrefix + "a".repeat(bloatSize) + "*"
assertFailsWith<SerializationException>("Expected end of the block comment: \"*/\", but had EOF instead at path: \$") {
json.decodeFromString<StringData>(inputLine2, mode)
}

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -228,4 +228,4 @@ y_structure_lonely_true.json
y_structure_string_empty.json
y_structure_trailing_newline.json
y_structure_true_in_array.json
y_structure_whitespace_array.json
y_structure_whitespace_array.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Non-spec inputs that we accept with allowComments = true setting:
n_object_trailing_comment.json
n_object_trailing_comment_slash_open.json
n_structure_object_with_comment.json
Binary file not shown.
3 changes: 3 additions & 0 deletions formats/json/api/kotlinx-serialization-json.api
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ public final class kotlinx/serialization/json/JsonArraySerializer : kotlinx/seri
}

public final class kotlinx/serialization/json/JsonBuilder {
public final fun getAllowComments ()Z
public final fun getAllowSpecialFloatingPointValues ()Z
public final fun getAllowStructuredMapKeys ()Z
public final fun getAllowTrailingComma ()Z
Expand All @@ -111,6 +112,7 @@ public final class kotlinx/serialization/json/JsonBuilder {
public final fun getUseAlternativeNames ()Z
public final fun getUseArrayPolymorphism ()Z
public final fun isLenient ()Z
public final fun setAllowComments (Z)V
public final fun setAllowSpecialFloatingPointValues (Z)V
public final fun setAllowStructuredMapKeys (Z)V
public final fun setAllowTrailingComma (Z)V
Expand Down Expand Up @@ -141,6 +143,7 @@ public synthetic class kotlinx/serialization/json/JsonClassDiscriminator$Impl :

public final class kotlinx/serialization/json/JsonConfiguration {
public fun <init> ()V
public final fun getAllowComments ()Z
public final fun getAllowSpecialFloatingPointValues ()Z
public final fun getAllowStructuredMapKeys ()Z
public final fun getAllowTrailingComma ()Z
Expand Down
21 changes: 19 additions & 2 deletions formats/json/commonMain/src/kotlinx/serialization/json/Json.kt
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,13 @@ public sealed class Json(
* @throws [IllegalArgumentException] if the decoded input cannot be represented as a valid instance of type [T]
*/
public final override fun <T> decodeFromString(deserializer: DeserializationStrategy<T>, @FormatLanguage("json", "", "") string: String): T {
val lexer = StringJsonLexer(string)
val lexer = StringJsonLexer(this, string)
val input = StreamingJsonDecoder(this, WriteMode.OBJ, lexer, deserializer.descriptor, null)
val result = input.decodeSerializableValue(deserializer)
lexer.expectEof()
return result
}

/**
* Serializes the given [value] into an equivalent [JsonElement] using the given [serializer]
*
Expand Down Expand Up @@ -385,6 +386,22 @@ public class JsonBuilder internal constructor(json: Json) {
@ExperimentalSerializationApi
public var allowTrailingComma: Boolean = json.configuration.allowTrailingComma

/**
* Allows parser to accept C/Java-style comments in JSON input.
*
* Comments are being skipped and are not stored anywhere; this setting does not affect encoding in any way.
*
* More specifically, a comment is a substring that is not a part of JSON key or value, conforming to one of those:
*
* 1. Starts with `//` characters and ends with a newline character `\n`.
* 2. Starts with `/*` characters and ends with `*/` characters. Nesting block comments
* is not supported: no matter how many `/*` characters you have, first `*/` will end the comment.
*
* `false` by default.
*/
@ExperimentalSerializationApi
public var allowComments: Boolean = json.configuration.allowComments

/**
* Module with contextual and polymorphic serializers to be used in the resulting [Json] instance.
*
Expand Down Expand Up @@ -422,7 +439,7 @@ public class JsonBuilder internal constructor(json: Json) {
allowStructuredMapKeys, prettyPrint, explicitNulls, prettyPrintIndent,
coerceInputValues, useArrayPolymorphism,
classDiscriminator, allowSpecialFloatingPointValues, useAlternativeNames,
namingStrategy, decodeEnumsCaseInsensitive, allowTrailingComma, classDiscriminatorMode
namingStrategy, decodeEnumsCaseInsensitive, allowTrailingComma, allowComments, classDiscriminatorMode
)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public class JsonConfiguration @OptIn(ExperimentalSerializationApi::class) inter
@ExperimentalSerializationApi
public val allowTrailingComma: Boolean = false,
@ExperimentalSerializationApi
public val allowComments: Boolean = false,
@ExperimentalSerializationApi
public var classDiscriminatorMode: ClassDiscriminatorMode = ClassDiscriminatorMode.POLYMORPHIC,
) {

Expand All @@ -49,7 +51,7 @@ public class JsonConfiguration @OptIn(ExperimentalSerializationApi::class) inter
"prettyPrintIndent='$prettyPrintIndent', coerceInputValues=$coerceInputValues, useArrayPolymorphism=$useArrayPolymorphism, " +
"classDiscriminator='$classDiscriminator', allowSpecialFloatingPointValues=$allowSpecialFloatingPointValues, " +
"useAlternativeNames=$useAlternativeNames, namingStrategy=$namingStrategy, decodeEnumsCaseInsensitive=$decodeEnumsCaseInsensitive, " +
"allowTrailingComma=$allowTrailingComma, classDiscriminatorMode=$classDiscriminatorMode)"
"allowTrailingComma=$allowTrailingComma, allowComments=$allowComments, classDiscriminatorMode=$classDiscriminatorMode)"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public fun <T> decodeByReader(
deserializer: DeserializationStrategy<T>,
reader: InternalJsonReader
): T {
val lexer = ReaderJsonLexer(reader)
val lexer = ReaderJsonLexer(json, reader)
try {
val input = StreamingJsonDecoder(json, WriteMode.OBJ, lexer, deserializer.descriptor, null)
val result = input.decodeSerializableValue(deserializer)
Expand All @@ -56,7 +56,7 @@ public fun <T> decodeToSequenceByReader(
deserializer: DeserializationStrategy<T>,
format: DecodeSequenceMode = DecodeSequenceMode.AUTO_DETECT
): Sequence<T> {
val lexer = ReaderJsonLexer(reader, CharArray(BATCH_SIZE)) // Unpooled buffer due to lazy nature of sequence
val lexer = ReaderJsonLexer(json, reader, CharArray(BATCH_SIZE)) // Unpooled buffer due to lazy nature of sequence
val iter = JsonIterator(format, json, lexer, deserializer)
return Sequence { iter }.constrainOnce()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ public fun <T> decodeStringToJsonTree(
deserializer: DeserializationStrategy<T>,
source: String
): JsonElement {
val lexer = StringJsonLexer(source)
val lexer = StringJsonLexer(json, source)
val input = StreamingJsonDecoder(json, WriteMode.OBJ, lexer, deserializer.descriptor, null)
val tree = input.decodeJsonElement()
lexer.expectEof()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,12 @@ private sealed class AbstractJsonTreeDecoder(
return this as? JsonLiteral ?: throw JsonDecodingException(-1, "Unexpected 'null' literal when non-nullable $type was expected")
}

override fun decodeTaggedInline(tag: String, inlineDescriptor: SerialDescriptor): Decoder =
if (inlineDescriptor.isUnsignedNumber) JsonDecoderForUnsignedTypes(StringJsonLexer(getPrimitiveValue(tag).content), json)
else super.decodeTaggedInline(tag, inlineDescriptor)
override fun decodeTaggedInline(tag: String, inlineDescriptor: SerialDescriptor): Decoder {
return if (inlineDescriptor.isUnsignedNumber) {
val lexer = StringJsonLexer(json, getPrimitiveValue(tag).content)
JsonDecoderForUnsignedTypes(lexer, json)
} else super.decodeTaggedInline(tag, inlineDescriptor)
}

override fun decodeInline(descriptor: SerialDescriptor): Decoder {
return if (currentTagOrNull != null) super.decodeInline(descriptor)
Expand Down
Loading