-
Notifications
You must be signed in to change notification settings - Fork 24.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ML] Optimize source extraction for categorize_text aggregation #79099
Changes from all commits
21bae1f
3ac3eff
197f5d4
e7384d7
368d801
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
import org.elasticsearch.common.Strings; | ||
import org.elasticsearch.common.bytes.BytesArray; | ||
import org.elasticsearch.common.bytes.BytesReference; | ||
import org.elasticsearch.core.Nullable; | ||
import org.elasticsearch.core.Tuple; | ||
import org.elasticsearch.common.compress.Compressor; | ||
import org.elasticsearch.common.compress.CompressorFactory; | ||
|
@@ -27,6 +28,7 @@ | |
import org.elasticsearch.xcontent.XContentParseException; | ||
import org.elasticsearch.xcontent.XContentType; | ||
import org.elasticsearch.xcontent.XContentParser; | ||
import org.elasticsearch.xcontent.support.filtering.FilterPath; | ||
|
||
import java.io.BufferedInputStream; | ||
import java.io.IOException; | ||
|
@@ -101,6 +103,14 @@ public static Tuple<XContentType, Map<String, Object>> convertToMap(BytesReferen | |
return convertToMap(bytes, ordered, null); | ||
} | ||
|
||
/** | ||
* Exactly the same as {@link XContentHelper#convertToMap(BytesReference, boolean, XContentType, FilterPath[], FilterPath[])} but | ||
* none of the fields are filtered | ||
*/ | ||
public static Tuple<XContentType, Map<String, Object>> convertToMap(BytesReference bytes, boolean ordered, XContentType xContentType) { | ||
return convertToMap(bytes, ordered, xContentType, null, null); | ||
} | ||
|
||
/** | ||
* Converts the given bytes into a map that is optionally ordered. The provided {@link XContentType} must be non-null. | ||
* <p> | ||
|
@@ -110,8 +120,13 @@ public static Tuple<XContentType, Map<String, Object>> convertToMap(BytesReferen | |
* frequently when folks write nanosecond precision dates as a decimal | ||
* number. | ||
*/ | ||
public static Tuple<XContentType, Map<String, Object>> convertToMap(BytesReference bytes, boolean ordered, XContentType xContentType) | ||
throws ElasticsearchParseException { | ||
public static Tuple<XContentType, Map<String, Object>> convertToMap( | ||
BytesReference bytes, | ||
boolean ordered, | ||
XContentType xContentType, | ||
@Nullable FilterPath[] include, | ||
@Nullable FilterPath[] exclude | ||
) throws ElasticsearchParseException { | ||
try { | ||
final XContentType contentType; | ||
InputStream input; | ||
|
@@ -129,14 +144,16 @@ public static Tuple<XContentType, Map<String, Object>> convertToMap(BytesReferen | |
final int length = bytes.length(); | ||
contentType = xContentType != null ? xContentType : XContentFactory.xContentType(raw, offset, length); | ||
return new Tuple<>(Objects.requireNonNull(contentType), | ||
convertToMap(XContentFactory.xContent(contentType), raw, offset, length, ordered)); | ||
convertToMap(XContentFactory.xContent(contentType), raw, offset, length, ordered, include, exclude)); | ||
} else { | ||
input = bytes.streamInput(); | ||
contentType = xContentType != null ? xContentType : XContentFactory.xContentType(input); | ||
} | ||
try (InputStream stream = input) { | ||
return new Tuple<>(Objects.requireNonNull(contentType), | ||
convertToMap(XContentFactory.xContent(contentType), stream, ordered)); | ||
return new Tuple<>( | ||
Objects.requireNonNull(contentType), | ||
convertToMap(XContentFactory.xContent(contentType), stream, ordered, include, exclude) | ||
); | ||
} | ||
} catch (IOException e) { | ||
throw new ElasticsearchParseException("Failed to parse content to map", e); | ||
|
@@ -158,14 +175,35 @@ public static Map<String, Object> convertToMap(XContent xContent, String string, | |
} | ||
|
||
/** | ||
* Convert a string in some {@link XContent} format to a {@link Map}. Throws an {@link ElasticsearchParseException} if there is any | ||
* error. Note that unlike {@link #convertToMap(BytesReference, boolean)}, this doesn't automatically uncompress the input. | ||
* The same as {@link XContentHelper#convertToMap(XContent, byte[], int, int, boolean, FilterPath[], FilterPath[])} but none of the | ||
* fields are filtered. | ||
*/ | ||
public static Map<String, Object> convertToMap(XContent xContent, InputStream input, boolean ordered) | ||
throws ElasticsearchParseException { | ||
return convertToMap(xContent, input, ordered, null, null); | ||
} | ||
|
||
/** | ||
* Convert a string in some {@link XContent} format to a {@link Map}. Throws an {@link ElasticsearchParseException} if there is any | ||
* error. Note that unlike {@link #convertToMap(BytesReference, boolean)}, this doesn't automatically uncompress the input. | ||
* | ||
* Additionally, fields may be included or excluded from the parsing. | ||
*/ | ||
public static Map<String, Object> convertToMap( | ||
XContent xContent, | ||
InputStream input, | ||
boolean ordered, | ||
@Nullable FilterPath[] include, | ||
@Nullable FilterPath[] exclude | ||
) throws ElasticsearchParseException { | ||
// It is safe to use EMPTY here because this never uses namedObject | ||
try (XContentParser parser = xContent.createParser(NamedXContentRegistry.EMPTY, | ||
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, input)) { | ||
try (XContentParser parser = xContent.createParser( | ||
NamedXContentRegistry.EMPTY, | ||
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, | ||
input, | ||
include, | ||
exclude | ||
)) { | ||
return ordered ? parser.mapOrdered() : parser.map(); | ||
} catch (IOException e) { | ||
throw new ElasticsearchParseException("Failed to parse content to map", e); | ||
|
@@ -178,9 +216,35 @@ public static Map<String, Object> convertToMap(XContent xContent, InputStream in | |
*/ | ||
public static Map<String, Object> convertToMap(XContent xContent, byte[] bytes, int offset, int length, boolean ordered) | ||
throws ElasticsearchParseException { | ||
return convertToMap(xContent, bytes, offset, length, ordered, null, null); | ||
} | ||
|
||
/** | ||
* Convert a byte array in some {@link XContent} format to a {@link Map}. Throws an {@link ElasticsearchParseException} if there is any | ||
* error. Note that unlike {@link #convertToMap(BytesReference, boolean)}, this doesn't automatically uncompress the input. | ||
* | ||
* Unlike {@link XContentHelper#convertToMap(XContent, byte[], int, int, boolean)} this optionally accepts fields to include or exclude | ||
* during XContent parsing. | ||
*/ | ||
public static Map<String, Object> convertToMap( | ||
XContent xContent, | ||
byte[] bytes, | ||
int offset, | ||
int length, | ||
boolean ordered, | ||
@Nullable FilterPath[] include, | ||
@Nullable FilterPath[] exclude | ||
) throws ElasticsearchParseException { | ||
// It is safe to use EMPTY here because this never uses namedObject | ||
try (XContentParser parser = xContent.createParser(NamedXContentRegistry.EMPTY, | ||
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, bytes, offset, length)) { | ||
try (XContentParser parser = xContent.createParser( | ||
NamedXContentRegistry.EMPTY, | ||
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, | ||
bytes, | ||
offset, | ||
length, | ||
include, | ||
exclude) | ||
) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wow this a lot. There has to be a better way..... I'm sure there is, but we're in a local minima of effort here. This is a fine change, but I think I need to add some refactoring here to my personal backlog. |
||
return ordered ? parser.mapOrdered() : parser.map(); | ||
} catch (IOException e) { | ||
throw new ElasticsearchParseException("Failed to parse content to map", e); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
import org.elasticsearch.common.xcontent.support.XContentMapValues; | ||
import org.elasticsearch.index.fieldvisitor.FieldsVisitor; | ||
import org.elasticsearch.search.fetch.subphase.FetchSourceContext; | ||
import org.elasticsearch.xcontent.support.filtering.FilterPath; | ||
|
||
import java.io.IOException; | ||
import java.util.Collection; | ||
|
@@ -157,6 +158,44 @@ public List<Object> extractRawValues(String path) { | |
return XContentMapValues.extractRawValues(path, source()); | ||
} | ||
|
||
/** | ||
* Returns the values associated with the path. Those are "low" level values, and it can | ||
* handle path expression where an array/list is navigated within. | ||
* | ||
* The major difference with {@link SourceLookup#extractRawValues(String)} is that this version will: | ||
* | ||
* - not cache source if it's not already parsed | ||
* - will only extract the desired values from the compressed source instead of deserializing the whole object | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
If I read the source correctly it still does, it just filters them early, still the whole object is parsed. This save cycles which is great, however I think this documentation claim is not correct There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It does pull the source object into memory (the compressed bytes), but it only parses out the specific included fields. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah - the |
||
* | ||
* This is useful when the caller only wants a single value from source and does not care of source is fully parsed and cached | ||
* for later use. | ||
* @param path The path from which to extract the values from source | ||
* @return The list of found values or an empty list if none are found | ||
*/ | ||
public List<Object> extractRawValuesWithoutCaching(String path) { | ||
if (source != null) { | ||
return XContentMapValues.extractRawValues(path, source); | ||
} | ||
FilterPath[] filterPaths = FilterPath.compile(Set.of(path)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are fast to compile now but I think they'll get slower. Right now they just sort of shift the lists around and do some string manipulation. I wonder if in the future we should make a
or something. That'd be a neat. It can wait. |
||
if (sourceAsBytes != null) { | ||
return XContentMapValues.extractRawValues( | ||
path, | ||
XContentHelper.convertToMap(sourceAsBytes, false, null, filterPaths, null).v2() | ||
); | ||
} | ||
try { | ||
FieldsVisitor sourceFieldVisitor = new FieldsVisitor(true); | ||
fieldReader.accept(docId, sourceFieldVisitor); | ||
BytesReference source = sourceFieldVisitor.source(); | ||
return XContentMapValues.extractRawValues( | ||
path, | ||
XContentHelper.convertToMap(source, false, null, filterPaths, null).v2() | ||
); | ||
} catch (Exception e) { | ||
throw new ElasticsearchParseException("failed to parse / load source", e); | ||
} | ||
} | ||
|
||
/** | ||
* For the provided path, return its value in the source. | ||
* | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When I was last here I thought "we really should do something about all these overrides. That's still true. But I don't want to block this change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There do seem to be a ton of unnecessary overrides that are just there to allow other overrides. I didn't want to tackle that :D
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah. I'd keep them for now. I'll still grumble about them. But i'll have a think about if we can do something.