Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Explicit Text Support in Serialization #32277

Merged
merged 15 commits into from
Jan 10, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,17 @@
package com.azure.core.implementation;

import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Arrays;

/**
* This class is an extension of {@link ByteArrayOutputStream} which allows access to the backing {@code byte[]} without
* copying the data. This class is only used for internal purposes where we know it is safe to directly access the
* {@code byte[]} without copying.
* <p>
* This class isn't meant to be thread-safe as usage should be internal to azure-core and should be guarded
* appropriately when used.
*/
public class AccessibleByteArrayOutputStream extends ByteArrayOutputStream {
/**
Expand All @@ -30,10 +35,39 @@ public AccessibleByteArrayOutputStream(int initialCapacity) {
}

/**
* Returns a copy of the bytes written to this stream.
* <p>
* Only the first {@code count} bytes of the internal buffer are copied, so the returned array has exactly
* {@code count} elements.
*
* @return A new {@code byte[]} containing the bytes written so far.
*/
@Override
public synchronized byte[] toByteArray() {
public byte[] toByteArray() {
return Arrays.copyOf(buf, count);
}

/**
* Returns the internal {@code byte[]} without copying.
* <p>
* This is the full backing {@code byte[]}, not just the written range: if writing required it to be resized to
* 8192 bytes but only 6000 bytes were written, the final 2192 bytes will be undefined data. When passing the result
* to an API that accepts a {@code byte[]} you must use the range based overload with {@link #count()}; if a range
* based overload isn't available use {@link #toByteArray()}, which copies only the range of bytes written.
* <p>
* The returned array is the same instance this stream holds, so changes made to it are visible through the stream.
*
* @return A direct reference to the internal {@code byte[]} where data is being written.
*/
public byte[] toByteArrayUnsafe() {
alzimmermsft marked this conversation as resolved.
Show resolved Hide resolved
return buf;
}

/**
 * Exposes the bytes written so far as a read-only {@link ByteBuffer} without copying them.
 * <p>
 * The buffer is a view over the internal {@code byte[]} limited to the range {@code [0, count())} at the time of
 * the call. Because it shares the internal array, changes to bytes that were already written show through the
 * returned {@link ByteBuffer}; bytes written to this stream afterwards are not part of the view since its limit is
 * fixed when it is created. The view is read-only so callers cannot modify the internal array through it.
 *
 * @return A read-only {@link ByteBuffer} backed by the internal buffer and covering the bytes written so far.
 */
public ByteBuffer toByteBuffer() {
    final ByteBuffer writtenRange = ByteBuffer.wrap(buf, 0, count);
    return writtenRange.asReadOnlyBuffer();
}

/**
* The number of bytes that have been written to the stream.
*
Expand All @@ -52,4 +86,17 @@ public int count() {
public String toString(Charset charset) {
// Decode only the first 'count' bytes; anything past that in 'buf' has not been written.
return new String(buf, 0, count, charset);
}

/**
* Gets a BOM aware string representation of the stream.
* <p>
* This method is the equivalent of calling
* {@code ImplUtils.bomAwareToString(toByteArrayUnsafe(), 0, count(), contentType)}, meaning the internal
* {@code byte[]} is read directly without being copied.
*
* @param contentType The {@code Content-Type} header value.
* @return A string representation of the stream encoded to the found encoding.
*/
public String bomAwareToString(String contentType) {
return ImplUtils.bomAwareToString(buf, 0, count, contentType);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
import java.io.OutputStream;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.time.DateTimeException;
import java.time.Duration;
import java.time.OffsetDateTime;
Expand All @@ -26,6 +30,8 @@
import java.util.NoSuchElementException;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Utility class containing implementation specific methods.
Expand All @@ -37,6 +43,16 @@ public final class ImplUtils {
// future improvement - make this configurable
public static final int MAX_CACHE_SIZE = 10000;

// UTF-32 charsets have no StandardCharsets constant, so they are looked up by name once and reused.
private static final Charset UTF_32BE = Charset.forName("UTF-32BE");
private static final Charset UTF_32LE = Charset.forName("UTF-32LE");
// Byte values compared against the leading bytes of a payload to detect a byte order mark (BOM).
private static final byte ZERO = (byte) 0x00;
private static final byte BB = (byte) 0xBB;
private static final byte BF = (byte) 0xBF;
private static final byte EF = (byte) 0xEF;
private static final byte FE = (byte) 0xFE;
private static final byte FF = (byte) 0xFF;
// Case-insensitively captures the token following "charset=" (group 1) from a Content-Type header value.
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=(\\S+)\\b", Pattern.CASE_INSENSITIVE);

/**
* Attempts to extract a retry after duration from a given set of {@link HttpHeaders}.
* <p>
Expand Down Expand Up @@ -247,6 +263,61 @@ public Map.Entry<String, String> next() {
}
}

/**
 * Attempts to convert a range of a byte array into the properly encoded String.
 * <p>
 * This utility method will attempt to find the encoding for the String in this order.
 * <ol>
 * <li>Find the byte order mark at the start of the range, i.e. at {@code offset}.</li>
 * <li>Find the charset in the {@code contentType} header.</li>
 * <li>Default to {@code UTF-8}.</li>
 * </ol>
 * When a byte order mark is found it is not included in the returned String. Only the bytes in the range
 * {@code [offset, offset + count)} are ever decoded, so it is safe to pass a backing array that is larger than the
 * data it contains.
 *
 * @param bytes The byte array.
 * @param offset The starting offset in the byte array.
 * @param count The number of bytes to process in the byte array.
 * @param contentType The {@code Content-Type} header value.
 * @return A string representation of the byte encoded to the found encoding, or null if {@code bytes} is null.
 * @throws IndexOutOfBoundsException If {@code offset} and {@code count} don't describe a valid range of
 * {@code bytes}.
 */
public static String bomAwareToString(byte[] bytes, int offset, int count, String contentType) {
    if (bytes == null) {
        return null;
    }

    // Each BOM check is made relative to 'offset' and the decoded range is 'count' minus the BOM length, so
    // bytes outside of [offset, offset + count) are never read. The UTF-32 checks must come before the UTF-16
    // checks as the UTF-32LE BOM (FF FE 00 00) begins with the UTF-16LE BOM (FF FE).
    if (count >= 3 && bytes[offset] == EF && bytes[offset + 1] == BB && bytes[offset + 2] == BF) {
        return new String(bytes, offset + 3, count - 3, StandardCharsets.UTF_8);
    } else if (count >= 4 && bytes[offset] == ZERO && bytes[offset + 1] == ZERO
        && bytes[offset + 2] == FE && bytes[offset + 3] == FF) {
        return new String(bytes, offset + 4, count - 4, UTF_32BE);
    } else if (count >= 4 && bytes[offset] == FF && bytes[offset + 1] == FE
        && bytes[offset + 2] == ZERO && bytes[offset + 3] == ZERO) {
        return new String(bytes, offset + 4, count - 4, UTF_32LE);
    } else if (count >= 2 && bytes[offset] == FE && bytes[offset + 1] == FF) {
        return new String(bytes, offset + 2, count - 2, StandardCharsets.UTF_16BE);
    } else if (count >= 2 && bytes[offset] == FF && bytes[offset + 1] == FE) {
        return new String(bytes, offset + 2, count - 2, StandardCharsets.UTF_16LE);
    }

    /*
     * No BOM was found. Attempt to retrieve the charset from the 'Content-Type' header, if the value isn't
     * present or is invalid fallback to 'UTF-8' for the charset.
     */
    Charset charset = StandardCharsets.UTF_8;
    if (!CoreUtils.isNullOrEmpty(contentType)) {
        Matcher charsetMatcher = CHARSET_PATTERN.matcher(contentType);
        if (charsetMatcher.find()) {
            try {
                charset = Charset.forName(charsetMatcher.group(1));
            } catch (IllegalCharsetNameException | UnsupportedCharsetException ignored) {
                // The header named a charset that is malformed or unknown to this JVM, keep the UTF-8 default.
            }
        }
    }

    return new String(bytes, offset, count, charset);
}

private ImplUtils() {
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ static ByteBuffer serializeAsJsonSerializable(Object jsonSerializable) throws IO
JSON_WRITER_WRITE_JSON_SERIALIZABLE.writeJson(jsonWriter, jsonSerializable);
JSON_WRITER_FLUSH.flush(jsonWriter);

return ByteBuffer.wrap(outputStream.toByteArray(), 0, outputStream.count());
return outputStream.toByteBuffer();
}
}

Expand Down Expand Up @@ -289,7 +289,7 @@ static ByteBuffer serializeAsXmlSerializable(Object bodyContent) throws IOExcept
XML_WRITER_WRITE_XML_SERIALIZABLE.writeXml(xmlWriter, bodyContent);
XML_WRITER_FLUSH.flush(xmlWriter);

return ByteBuffer.wrap(outputStream.toByteArray(), 0, outputStream.count());
return outputStream.toByteBuffer();
} catch (IOException ex) {
throw ex;
} catch (Exception ex) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,13 @@
import com.azure.core.http.HttpHeaders;
import com.azure.core.http.policy.HttpLogOptions;
import com.azure.core.http.rest.PagedResponse;
import com.azure.core.implementation.ImplUtils;
import com.azure.core.util.logging.ClientLogger;
import org.reactivestreams.Publisher;
import reactor.core.publisher.Flux;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.time.Duration;
import java.util.Arrays;
import java.util.Collection;
Expand All @@ -27,8 +24,6 @@
import java.util.Properties;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
Expand All @@ -37,15 +32,6 @@
public final class CoreUtils {
// CoreUtils is a commonly used utility, use a static logger.
private static final ClientLogger LOGGER = new ClientLogger(CoreUtils.class);
private static final Charset UTF_32BE = Charset.forName("UTF-32BE");
private static final Charset UTF_32LE = Charset.forName("UTF-32LE");
private static final byte ZERO = (byte) 0x00;
private static final byte BB = (byte) 0xBB;
private static final byte BF = (byte) 0xBF;
private static final byte EF = (byte) 0xEF;
private static final byte FE = (byte) 0xFE;
private static final byte FF = (byte) 0xFF;
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([\\S]+)\\b", Pattern.CASE_INSENSITIVE);

private CoreUtils() {
// Exists only to defeat instantiation.
Expand Down Expand Up @@ -235,36 +221,7 @@ public static String bomAwareToString(byte[] bytes, String contentType) {
return null;
}

if (bytes.length >= 3 && bytes[0] == EF && bytes[1] == BB && bytes[2] == BF) {
return new String(bytes, 3, bytes.length - 3, StandardCharsets.UTF_8);
} else if (bytes.length >= 4 && bytes[0] == ZERO && bytes[1] == ZERO && bytes[2] == FE && bytes[3] == FF) {
return new String(bytes, 4, bytes.length - 4, UTF_32BE);
} else if (bytes.length >= 4 && bytes[0] == FF && bytes[1] == FE && bytes[2] == ZERO && bytes[3] == ZERO) {
return new String(bytes, 4, bytes.length - 4, UTF_32LE);
} else if (bytes.length >= 2 && bytes[0] == FE && bytes[1] == FF) {
return new String(bytes, 2, bytes.length - 2, StandardCharsets.UTF_16BE);
} else if (bytes.length >= 2 && bytes[0] == FF && bytes[1] == FE) {
return new String(bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
} else {
/*
* Attempt to retrieve the default charset from the 'Content-Encoding' header, if the value isn't
* present or invalid fallback to 'UTF-8' for the default charset.
*/
if (!isNullOrEmpty(contentType)) {
try {
Matcher charsetMatcher = CHARSET_PATTERN.matcher(contentType);
if (charsetMatcher.find()) {
return new String(bytes, Charset.forName(charsetMatcher.group(1)));
} else {
return new String(bytes, StandardCharsets.UTF_8);
}
} catch (IllegalCharsetNameException | UnsupportedCharsetException ex) {
return new String(bytes, StandardCharsets.UTF_8);
}
} else {
return new String(bytes, StandardCharsets.UTF_8);
}
}
return ImplUtils.bomAwareToString(bytes, 0, bytes.length, contentType);
}

/**
Expand Down
Loading