Skip to content

Commit

Permalink
Implement ByteSource.asCharSource(charset).read() using the decoding …
Browse files Browse the repository at this point in the history
…string constructor instead of streaming the contents into a StringBuilder.

this allows us to avoid a number of copies that are currently happening for each character (1. into a temporary CharBuffer, 2. into a StringBuilder, 3 into the String char[]) and replace it with simply whatever is required by ByteSource.read() and the String(byte[], charset) constructor.  For certain ByteSource implementations (like FileByteSource) ByteSource.read() can often presize the byte[] correctly and the string constructor can also do some things to guess at the correct array sizes and avoid copies in the common case.

Benchmarks have shown that this should improve the speed of Files.toString significantly.

further work should also be done for the normal CharSource.read implementation.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=162220754
  • Loading branch information
lukesandberg authored and cpovirk committed Jul 17, 2017
1 parent 71b5b85 commit ee12894
Show file tree
Hide file tree
Showing 4 changed files with 298 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* Copyright (C) 2017 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/

package com.google.common.io;

import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.Param;
import com.google.caliper.api.VmOptions;
import com.google.common.base.Optional;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Random;

/**
* Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}.
*/
// These benchmarks allocate a lot of data so use a large heap
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"})
public class ByteSourceAsCharSourceReadBenchmark {
enum ReadStrategy {
TO_BYTE_ARRAY_NEW_STRING {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
return new String(byteSource.read(), cs);
}
},
USING_CHARSTREAMS_COPY {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
StringBuilder sb = new StringBuilder();
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
CharStreams.copy(reader, sb);
}
return sb.toString();
}
},
// It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't
// my best guess is that the jdk authors have spent more time optimizing that callpath than this
// one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases
// theoretically we could duplicate all that logic here to try to beat 'new String' or at least
// come close.
USING_DECODER_WITH_SIZE_HINT {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
Optional<Long> size = byteSource.sizeIfKnown();
// if we know the size and it fits in an int
if (size.isPresent() && size.get().longValue() == size.get().intValue()) {
// otherwise try to presize a StringBuilder
// it is kind of lame that we need to construct a decoder to access this value.
// if this is a concern we could add special cases for some known charsets (like utf8)
// or we could avoid inputstreamreader and use the decoder api directly
// TODO(lukes): in a real implementation we would need to handle overflow conditions
int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte());
char[] buffer = new char[maxChars];
int bufIndex = 0;
int remaining = buffer.length;
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
int nRead = 0;
while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) {
bufIndex += nRead;
remaining -= nRead;
}
if (nRead == -1) {
// we reached EOF
return new String(buffer, 0, bufIndex);
}
// otherwise we got the size wrong. This can happen if the size changes between when
// we called sizeIfKnown and when we started reading the file (or i guess if
// maxCharsPerByte is wrong)
// Fallback to an incremental approach
StringBuilder builder = new StringBuilder(bufIndex + 32);
builder.append(buffer, 0, bufIndex);
buffer = null; // release for gc
CharStreams.copy(reader, builder);
return builder.toString();
}

} else {
return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs);
}
}
};

abstract String read(ByteSource byteSource, Charset cs) throws IOException;
}

@Param({"UTF-8"})
String charsetName;

@Param ReadStrategy strategy;

@Param({"10", "1024", "1048576"})
int size;

Charset charset;
ByteSource data;

@BeforeExperiment
public void setUp() {
charset = Charset.forName(charsetName);
StringBuilder sb = new StringBuilder();
Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior
sb.ensureCapacity(size);
for (int k = 0; k < size; k++) {
// [9-127) includes all ascii non-control characters
sb.append((char) (random.nextInt(127 - 9) + 9));
}
String string = sb.toString();
sb.setLength(0);
data = ByteSource.wrap(string.getBytes(charset));
}

@Benchmark
public int timeCopy(int reps) throws IOException {
int r = 0;
final Charset localCharset = charset;
final ByteSource localData = data;
final ReadStrategy localStrategy = strategy;
for (int i = 0; i < reps; i++) {
r += localStrategy.read(localData, localCharset).hashCode();
}
return r;
}
}
12 changes: 12 additions & 0 deletions android/guava/src/com/google/common/io/ByteSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,18 @@ public Reader openStream() throws IOException {
return new InputStreamReader(ByteSource.this.openStream(), charset);
}

@Override
public String read() throws IOException {
// Reading all the data as a byte array is more efficient than the default read()
// implementation because:
// 1. the string constructor can avoid an extra copy most of the time by correctly sizing the
// internal char array (hard to avoid using StringBuilder)
// 2. we avoid extra copies into temporary buffers altogether
// The downside is that this will cause us to store the file bytes in memory twice for a short
// amount of time.
return new String(ByteSource.this.read(), charset);
}

@Override
public String toString() {
return ByteSource.this.toString() + ".asCharSource(" + charset + ")";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* Copyright (C) 2017 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/

package com.google.common.io;

import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.Param;
import com.google.caliper.api.VmOptions;
import com.google.common.base.Optional;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Random;

/**
* Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}.
*/
// These benchmarks allocate a lot of data so use a large heap
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"})
public class ByteSourceAsCharSourceReadBenchmark {
enum ReadStrategy {
TO_BYTE_ARRAY_NEW_STRING {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
return new String(byteSource.read(), cs);
}
},
USING_CHARSTREAMS_COPY {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
StringBuilder sb = new StringBuilder();
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
CharStreams.copy(reader, sb);
}
return sb.toString();
}
},
// It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't
// my best guess is that the jdk authors have spent more time optimizing that callpath than this
// one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases
// theoretically we could duplicate all that logic here to try to beat 'new String' or at least
// come close.
USING_DECODER_WITH_SIZE_HINT {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
Optional<Long> size = byteSource.sizeIfKnown();
// if we know the size and it fits in an int
if (size.isPresent() && size.get().longValue() == size.get().intValue()) {
// otherwise try to presize a StringBuilder
// it is kind of lame that we need to construct a decoder to access this value.
// if this is a concern we could add special cases for some known charsets (like utf8)
// or we could avoid inputstreamreader and use the decoder api directly
// TODO(lukes): in a real implementation we would need to handle overflow conditions
int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte());
char[] buffer = new char[maxChars];
int bufIndex = 0;
int remaining = buffer.length;
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
int nRead = 0;
while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) {
bufIndex += nRead;
remaining -= nRead;
}
if (nRead == -1) {
// we reached EOF
return new String(buffer, 0, bufIndex);
}
// otherwise we got the size wrong. This can happen if the size changes between when
// we called sizeIfKnown and when we started reading the file (or i guess if
// maxCharsPerByte is wrong)
// Fallback to an incremental approach
StringBuilder builder = new StringBuilder(bufIndex + 32);
builder.append(buffer, 0, bufIndex);
buffer = null; // release for gc
CharStreams.copy(reader, builder);
return builder.toString();
}

} else {
return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs);
}
}
};

abstract String read(ByteSource byteSource, Charset cs) throws IOException;
}

@Param({"UTF-8"})
String charsetName;

@Param ReadStrategy strategy;

@Param({"10", "1024", "1048576"})
int size;

Charset charset;
ByteSource data;

@BeforeExperiment
public void setUp() {
charset = Charset.forName(charsetName);
StringBuilder sb = new StringBuilder();
Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior
sb.ensureCapacity(size);
for (int k = 0; k < size; k++) {
// [9-127) includes all ascii non-control characters
sb.append((char) (random.nextInt(127 - 9) + 9));
}
String string = sb.toString();
sb.setLength(0);
data = ByteSource.wrap(string.getBytes(charset));
}

@Benchmark
public int timeCopy(int reps) throws IOException {
int r = 0;
final Charset localCharset = charset;
final ByteSource localData = data;
final ReadStrategy localStrategy = strategy;
for (int i = 0; i < reps; i++) {
r += localStrategy.read(localData, localCharset).hashCode();
}
return r;
}
}
12 changes: 12 additions & 0 deletions guava/src/com/google/common/io/ByteSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,18 @@ public Reader openStream() throws IOException {
return new InputStreamReader(ByteSource.this.openStream(), charset);
}

@Override
public String read() throws IOException {
// Reading all the data as a byte array is more efficient than the default read()
// implementation because:
// 1. the string constructor can avoid an extra copy most of the time by correctly sizing the
// internal char array (hard to avoid using StringBuilder)
// 2. we avoid extra copies into temporary buffers altogether
// The downside is that this will cause us to store the file bytes in memory twice for a short
// amount of time.
return new String(ByteSource.this.read(), charset);
}

@Override
public String toString() {
return ByteSource.this.toString() + ".asCharSource(" + charset + ")";
Expand Down

0 comments on commit ee12894

Please sign in to comment.