Skip to content

Commit

Permalink
#28721 include in 24.04.24 LTS
Browse files Browse the repository at this point in the history
  • Loading branch information
erickgonzalez committed Aug 22, 2024
1 parent 5f67c3b commit f0d6c64
Show file tree
Hide file tree
Showing 16 changed files with 5,888 additions and 89 deletions.
3 changes: 2 additions & 1 deletion dotCMS/hotfix_tracking.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,5 @@ This maintenance release includes the following code fixes:
40. https://github.com/dotCMS/core/pull/28781 : Extracting logic to truncate first 400 chars from prompt. #28781
41. https://github.com/dotCMS/core/issues/28719 : Write Postman Tests for Generative AI Endpoints #28719
42. https://github.com/dotCMS/core/issues/28770 : dotAI: register EmbeddingContentListener #28770
43. https://github.com/dotCMS/core/pull/28929 : Fixing issues detected by Sonar #28929
43. https://github.com/dotCMS/core/pull/28929 : Fixing issues detected by Sonar #28929
44. https://github.com/dotCMS/core/issues/28721 : Write Postman Tests for Embeddings AI Endpoints #28721
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package com.dotcms.ai.api;

import com.dotcms.ai.rest.forms.EmbeddingsForm;
import com.dotcms.ai.util.OpenAIThreadPool;
import com.dotmarketing.portlets.contentlet.model.Contentlet;

import java.util.List;

/**
* The AsyncEmbeddingsCallStrategy class is responsible for embedding contentlets in an asynchronous manner.
*
* @author vico
*/
public class AsyncEmbeddingsCallStrategy implements EmbeddingsCallStrategy {

    /**
     * Wraps the given inodes and form in a {@link BulkEmbeddingsRunner} and submits it to
     * {@link OpenAIThreadPool}.
     *
     * @param inodes the list of inodes representing the contentlets to be embedded
     * @param embeddingsForm the form data containing the details for the embedding operation
     */
    @Override
    public void bulkEmbed(final List<String> inodes, final EmbeddingsForm embeddingsForm) {
        final BulkEmbeddingsRunner bulkTask = new BulkEmbeddingsRunner(inodes, embeddingsForm);
        OpenAIThreadPool.submit(bulkTask);
    }

    /**
     * Wraps the given contentlet data in an {@link EmbeddingsRunner} and submits it to
     * {@link OpenAIThreadPool}.
     *
     * @param embeddingsAPI the EmbeddingsAPIImpl instance handed to the runner
     * @param contentlet the contentlet to embed
     * @param content the content to embed
     * @param indexName the index name to use
     */
    @Override
    public void embed(final EmbeddingsAPIImpl embeddingsAPI,
                      final Contentlet contentlet,
                      final String content,
                      final String indexName) {
        final EmbeddingsRunner singleTask =
                new EmbeddingsRunner(embeddingsAPI, contentlet, content, indexName);
        OpenAIThreadPool.submit(singleTask);
    }

}
6 changes: 3 additions & 3 deletions dotCMS/src/main/java/com/dotcms/ai/api/EmbeddingsAPIImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ public int deleteByQuery(@NotNull final String deleteQuery, final Optional<Strin
}
newOffset += limit;

for(final ContentletSearch row : searchResults){
for(final ContentletSearch row : searchResults) {
final String esId = row.getId();
final Builder dto = new EmbeddingsDTO.Builder().withIdentifier(row.getIdentifier());

Expand Down Expand Up @@ -152,7 +152,7 @@ public boolean generateEmbeddingsForContent(@NotNull final Contentlet contentlet
return false;
}

OpenAIThreadPool.submit(new EmbeddingsRunner(this, contentlet, content.get(), indexName));
EmbeddingsCallStrategy.resolveStrategy().embed(this, contentlet, content.get(), indexName);

return true;
}
Expand Down Expand Up @@ -303,7 +303,7 @@ public void dropEmbeddingsTable() {
@Override
@WrapInTransaction
public void initEmbeddingsTable() {
EmbeddingsFactory.impl.get();
EmbeddingsFactory.impl.get().initVector();
}

@WrapInTransaction
Expand Down
47 changes: 47 additions & 0 deletions dotCMS/src/main/java/com/dotcms/ai/api/EmbeddingsCallStrategy.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package com.dotcms.ai.api;

import com.dotcms.ai.rest.forms.EmbeddingsForm;
import com.dotmarketing.portlets.contentlet.model.Contentlet;
import com.dotmarketing.util.ConfigUtils;

import java.util.List;

/**
* The EmbeddingsCallStrategy interface defines the contract for embedding strategies.
* Implementations of this interface will provide different ways to handle the embedding of contentlets.
*
* The bulkEmbed method takes a list of inodes and an EmbeddingsForm object, while the embed method takes a single
* contentlet together with its content and index name. The specifics of how the embedding is done depend on the implementation.
*
* @author Your Name
*/
public interface EmbeddingsCallStrategy {

    /**
     * Embeds a batch of contentlets identified by their inodes.
     *
     * @param inodes the inodes of the contentlets to embed
     * @param embeddingsForm the form data describing the embedding operation
     */
    void bulkEmbed(List<String> inodes, EmbeddingsForm embeddingsForm);

    /**
     * Embeds the given content of a single contentlet.
     *
     * @param embeddingsAPI the EmbeddingsAPIImpl instance to use
     * @param contentlet the contentlet the content belongs to
     * @param content the content to embed
     * @param indexName the index name to use
     */
    void embed(EmbeddingsAPIImpl embeddingsAPI, Contentlet contentlet, String content, String indexName);

    /**
     * Picks the strategy to use: a new {@link SyncEmbeddingsCallStrategy} when
     * {@link ConfigUtils#isDevMode()} reports dev mode, otherwise a new
     * {@link AsyncEmbeddingsCallStrategy}.
     *
     * @return the EmbeddingsCallStrategy implementation to use
     */
    static EmbeddingsCallStrategy resolveStrategy() {
        if (ConfigUtils.isDevMode()) {
            return new SyncEmbeddingsCallStrategy();
        }
        return new AsyncEmbeddingsCallStrategy();
    }

}
23 changes: 9 additions & 14 deletions dotCMS/src/main/java/com/dotcms/ai/api/EmbeddingsRunner.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ public void run() {
if (totalTokens < splitAtTokens) {
buffer.append(sentence.trim()).append(" ");
} else {
save(buffer);
saveEmbedding(buffer.toString());
buffer.setLength(0);
buffer.append(sentence.trim()).append(" ");
totalTokens = tokenCount;
}
}

if (buffer.toString().split("\\s+").length > 0) {
save(buffer);
saveEmbedding(buffer.toString());
}
} catch (Exception e) {
if (ConfigService.INSTANCE.config().getConfigBoolean(AppKeys.DEBUG_LOGGING)) {
Expand All @@ -89,18 +89,13 @@ public void run() {
}
}

private void save(StringBuilder buffer) {
saveEmbedding(buffer.toString().trim(), contentlet, indexName);
}

private void saveEmbedding(@NotNull final String content,
@NotNull final Contentlet contentlet,
final String indexName) {
if (UtilMethods.isEmpty(content)) {
private void saveEmbedding(@NotNull final String initial) {
if (UtilMethods.isEmpty(initial)) {
return;
}

if (embeddingsAPI.embeddingExists(contentlet.getInode(), indexName, content)) {
final String normalizedContent = initial.trim();
if (embeddingsAPI.embeddingExists(contentlet.getInode(), indexName, normalizedContent)) {
Logger.info(
this.getClass(),
"embedding already exists for content:"
Expand All @@ -110,9 +105,9 @@ private void saveEmbedding(@NotNull final String content,
return;
}

final Tuple2<Integer, List<Float>> embeddings = embeddingsAPI.pullOrGenerateEmbeddings(content);
final Tuple2<Integer, List<Float>> embeddings = embeddingsAPI.pullOrGenerateEmbeddings(normalizedContent);
if (embeddings._2.isEmpty()) {
Logger.info(this.getClass(), "NO TOKENS for " + contentlet.getContentType().variable() + " content:" + content);
Logger.info(this.getClass(), "NO TOKENS for " + contentlet.getContentType().variable() + " content:" + normalizedContent);
return;
}

Expand All @@ -124,7 +119,7 @@ private void saveEmbedding(@NotNull final String content,
.withTitle(contentlet.getTitle())
.withIdentifier(contentlet.getIdentifier())
.withHost(contentlet.getHost())
.withExtractedText(content)
.withExtractedText(normalizedContent)
.withIndexName(indexName)
.withEmbeddings(embeddings._2).build();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package com.dotcms.ai.api;

import com.dotcms.ai.rest.forms.EmbeddingsForm;
import com.dotmarketing.portlets.contentlet.model.Contentlet;

import java.util.List;

/**
* The SyncEmbeddingsCallStrategy class is responsible for embedding contentlets in a synchronous manner.
*
* @author vico
*/
public class SyncEmbeddingsCallStrategy implements EmbeddingsCallStrategy {

    /**
     * Builds a {@link BulkEmbeddingsRunner} for the given inodes and form and invokes its
     * {@code run()} method directly on the calling thread.
     *
     * @param inodes the list of inodes representing the contentlets to be embedded
     * @param embeddingsForm the form data containing the details for the embedding operation
     */
    @Override
    public void bulkEmbed(final List<String> inodes, final EmbeddingsForm embeddingsForm) {
        final BulkEmbeddingsRunner bulkTask = new BulkEmbeddingsRunner(inodes, embeddingsForm);
        bulkTask.run();
    }

    /**
     * Builds an {@link EmbeddingsRunner} for the given contentlet data and invokes its
     * {@code run()} method directly on the calling thread.
     *
     * @param embeddingsAPI the EmbeddingsAPIImpl instance handed to the runner
     * @param contentlet the contentlet to embed
     * @param content the content to embed
     * @param indexName the index name to use
     */
    @Override
    public void embed(final EmbeddingsAPIImpl embeddingsAPI,
                      final Contentlet contentlet,
                      final String content,
                      final String indexName) {
        final EmbeddingsRunner singleTask =
                new EmbeddingsRunner(embeddingsAPI, contentlet, content, indexName);
        singleTask.run();
    }
}
59 changes: 33 additions & 26 deletions dotCMS/src/main/java/com/dotcms/ai/db/EmbeddingsFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,35 +39,15 @@ public class EmbeddingsFactory {
public static final Lazy<EmbeddingsFactory> impl = Lazy.of(EmbeddingsFactory::new);

private EmbeddingsFactory() {
initVectorExtension();
initVectorDbTable();
initVector();
}

/**
* Initializes the PGVector extension in the database.
* This method is called when the class is instantiated.
*/
public void initVectorExtension() {
if (!doesExtensionExist()) {
Logger.info(EmbeddingsFactory.class, "Adding PGVector extension to database");
runSQL(EmbeddingsSQL.INIT_VECTOR_EXTENSION);
} else {
Logger.info(EmbeddingsFactory.class, "PGVector exists, skipping extension installation");
}
}

/**
* Initializes the database table for storing embeddings.
* This method is called when the class is instantiated.
*/
public void initVectorDbTable() {
try {
internalInitVectorDbTable();
} catch (Exception e) {
Logger.info(EmbeddingsFactory.class, "Create Table Failed : " + e.getMessage());
moveVectorDbTable();
internalInitVectorDbTable();
}
public void initVector() {
initVectorExtension();
initVectorDbTable();
}

/**
Expand Down Expand Up @@ -119,6 +99,33 @@ private boolean doesExtensionExist() {
}
}

/**
 * Ensures the PGVector extension is present in the database.
 * When {@code doesExtensionExist()} reports it is already there, only a log line is written;
 * otherwise the {@code EmbeddingsSQL.INIT_VECTOR_EXTENSION} statement is run.
 */
private void initVectorExtension() {
    if (doesExtensionExist()) {
        Logger.info(EmbeddingsFactory.class, "PGVector exists, skipping extension installation");
        return;
    }

    Logger.info(EmbeddingsFactory.class, "Adding PGVector extension to database");
    runSQL(EmbeddingsSQL.INIT_VECTOR_EXTENSION);
}

/**
 * Creates the database table used for storing embeddings.
 * If the first creation attempt throws, the failure is logged, the existing table is moved
 * via {@code moveVectorDbTable()}, and creation is attempted once more; an exception from
 * that second attempt is not caught here.
 */
private void initVectorDbTable() {
    try {
        internalInitVectorDbTable();
        // First attempt succeeded: nothing else to do.
        return;
    } catch (Exception e) {
        final String reason = e.getMessage();
        Logger.info(EmbeddingsFactory.class, "Create Table Failed : " + reason);
    }

    // Fallback path: move the current table out of the way and retry once.
    moveVectorDbTable();
    internalInitVectorDbTable();
}

/**
* Adds the PGvector type to the SQLConnection
* so it can be used and queried against
Expand Down Expand Up @@ -416,10 +423,10 @@ public long countEmbeddings(final EmbeddingsDTO dto) {
* @return a map of index names to counts
*/
public Map<String, Map<String, Object>> countEmbeddingsByIndex() {
final StringBuilder sql = new StringBuilder(EmbeddingsSQL.COUNT_EMBEDDINGS_BY_INDEX);
final String sql = EmbeddingsSQL.COUNT_EMBEDDINGS_BY_INDEX;

try (final Connection conn = getPGVectorConnection();
final PreparedStatement statement = conn.prepareStatement(sql.toString())) {
final PreparedStatement statement = conn.prepareStatement(sql)) {

final Map<String, Map<String, Object>> results = new TreeMap<>();
final ResultSet rs = statement.executeQuery();
Expand Down
34 changes: 20 additions & 14 deletions dotCMS/src/main/java/com/dotcms/ai/rest/EmbeddingsResource.java
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
package com.dotcms.ai.rest;

import com.dotcms.ai.AiKeys;
import com.dotcms.ai.api.BulkEmbeddingsRunner;
import com.dotcms.ai.api.EmbeddingsAPI;
import com.dotcms.ai.api.EmbeddingsCallStrategy;
import com.dotcms.ai.db.EmbeddingsDTO;
import com.dotcms.ai.rest.forms.CompletionsForm;
import com.dotcms.ai.rest.forms.EmbeddingsForm;
import com.dotcms.ai.util.OpenAIThreadPool;
import com.dotcms.rest.WebResource;
import com.dotmarketing.business.APILocator;
import com.dotmarketing.business.Role;
import com.dotmarketing.common.model.ContentletSearch;
import com.dotmarketing.portlets.contentlet.business.ContentletAPI;
import com.dotmarketing.util.Logger;
import com.dotmarketing.util.UtilMethods;
import com.dotmarketing.util.json.JSONObject;
Expand All @@ -37,6 +37,12 @@
@Path("/v1/ai/embeddings")
public class EmbeddingsResource {

private final ContentletAPI contentletAPI;

public EmbeddingsResource() {
this.contentletAPI = APILocator.getContentletAPI();
}

/**
* Test endpoint for the EmbeddingsResource.
*
Expand Down Expand Up @@ -74,17 +80,12 @@ public final Response embed(@Context final HttpServletRequest request,
final User user = new WebResource.InitBuilder(request, response).requiredBackendUser(true).init().getUser();
long startTime = System.currentTimeMillis();

if (UtilMethods.isEmpty(embeddingsForm.query)) {
return Response.ok("query is required").build();
}

try {
int added = 0;
int newOffset = embeddingsForm.offset;
for (int i = 0; i < 10000; i++) {
// searchIndex(String luceneQuery, int limit, int offset, String sortBy, User user, boolean respectFrontendRoles)
final List<ContentletSearch> searchResults = APILocator
.getContentletAPI()
final List<ContentletSearch> searchResults = contentletAPI
.searchIndex(
embeddingsForm.query + " +live:true",
embeddingsForm.limit,
Expand All @@ -100,15 +101,17 @@ public final Response embed(@Context final HttpServletRequest request,
final List<String> inodes = searchResults
.stream()
.map(ContentletSearch::getInode)
.collect(Collectors.toList());
.collect(Collectors.toUnmodifiableList());
added += inodes.size();
OpenAIThreadPool.submit(new BulkEmbeddingsRunner(inodes,embeddingsForm));

EmbeddingsCallStrategy.resolveStrategy().bulkEmbed(inodes, embeddingsForm);
}

final long totalTime = System.currentTimeMillis() - startTime;
final Map<String, Object> map = Map.of(
AiKeys.TIME_TO_EMBEDDINGS, totalTime + "ms",
AiKeys.TOTAL_TO_EMBED, added, AiKeys.INDEX_NAME, embeddingsForm.indexName);
AiKeys.TOTAL_TO_EMBED, added,
AiKeys.INDEX_NAME, embeddingsForm.indexName);
final ResponseBuilder builder = Response.ok(map, MediaType.APPLICATION_JSON);

return builder.build();
Expand All @@ -124,7 +127,7 @@ public final Response embed(@Context final HttpServletRequest request,
* @param request the HttpServletRequest object.
* @param response the HttpServletResponse object.
* @param json the JSON object containing the data for the embeddings to be deleted.
* @return a Response object containing the result of the embeddings deletion.
* @return a Response object containing the result of the embeddings' deletion.
*/
@DELETE
@JSONP
Expand All @@ -138,8 +141,10 @@ public final Response delete(@Context final HttpServletRequest request,

if (UtilMethods.isSet(() -> json.optString(AiKeys.DELETE_QUERY))){
final int numberDeleted =
EmbeddingsAPI.impl().deleteByQuery(json.optString(AiKeys.DELETE_QUERY),
Optional.ofNullable(json.optString(AiKeys.INDEX_NAME)), user);
EmbeddingsAPI.impl().deleteByQuery(
json.optString(AiKeys.DELETE_QUERY),
Optional.ofNullable(json.optString(AiKeys.INDEX_NAME)),
user);
return Response.ok(Map.of(AiKeys.DELETED, numberDeleted)).build();
}

Expand All @@ -152,6 +157,7 @@ public final Response delete(@Context final HttpServletRequest request,
.withHost(json.optString(AiKeys.SITE))
.build();
int deleted = EmbeddingsAPI.impl().deleteEmbedding(dto);

return Response.ok(Map.of(AiKeys.DELETED, deleted)).build();
}

Expand Down
Loading

0 comments on commit f0d6c64

Please sign in to comment.