From 22d66cc6d0d53b563097b1b4af45dcda06814c5d Mon Sep 17 00:00:00 2001 From: Michael hughes Date: Sun, 12 May 2024 16:23:38 +0100 Subject: [PATCH] Tweaking similariy search results --- ai/classes/LogLevel.php | 18 + ai/classes/LoggerAwareInterface.php | 7 + ai/classes/LoggerAwareTrait.php | 13 + ai/classes/LoggerInterface.php | 114 +++++ ai/classes/aiclient.php | 73 ++- ai/classes/aiprovider.php | 116 +++-- ai/classes/api.php | 5 +- ai/classes/logger.php | 60 ++- ai/classes/search/testable_manager.php | 15 + ai/cli/query.php | 23 +- course/moodleform_mod.php | 3 +- lang/en/ai.php | 2 + mod/xaichat/mod_form.php | 8 +- mod/xaichat/view.php | 42 +- search/engine/solrrag/classes/document.php | 15 +- search/engine/solrrag/classes/engine.php | 487 ++++++++++++++++----- search/engine/solrrag/settings.php | 15 + 17 files changed, 798 insertions(+), 218 deletions(-) create mode 100644 ai/classes/LogLevel.php create mode 100644 ai/classes/LoggerAwareInterface.php create mode 100644 ai/classes/LoggerAwareTrait.php create mode 100644 ai/classes/LoggerInterface.php create mode 100644 ai/classes/search/testable_manager.php diff --git a/ai/classes/LogLevel.php b/ai/classes/LogLevel.php new file mode 100644 index 0000000000000..08f33622b20ec --- /dev/null +++ b/ai/classes/LogLevel.php @@ -0,0 +1,18 @@ +logger = $logger; + } + public function log($message, array $context = [], $level = LogLevel::INFO) { + $this->logger->log($level, $message, $context); + } +} diff --git a/ai/classes/LoggerInterface.php b/ai/classes/LoggerInterface.php new file mode 100644 index 0000000000000..ed52f7eac9d88 --- /dev/null +++ b/ai/classes/LoggerInterface.php @@ -0,0 +1,114 @@ +provider = $provider; + $this->setLogger($provider->get_logger()); $settings = []; parent::__construct($settings); $this->setHeader('Authorization: Bearer ' . $this->provider->get('apikey')); $this->setHeader('Content-Type: application/json'); } + + public function get_embeddings_url(): string { return $this->provider->get('baseurl') . $this->provider->get('embeddingsurl'); } @@ -40,21 +44,31 @@ public function chat($messages) { ]; $params = json_encode($params); $rawresult = $this->post($this->get_chat_completions_url(), $params); + $this->logger->info("Response rescieved from AI provider: {name}", ['name' => $this->provider->get('name')]); $jsonresult = json_decode($rawresult); if (isset($jsonresult->error)) { + $this->logger->error("Error: " . $jsonresult->error->message . ":". print_r($messages, true)); throw new AiException("Error: " . $jsonresult->error->message . ":". print_r($messages, true)); //return "Error: " . $jsonresult->error->message . ":". print_r($messages, true); } $result = []; if (isset($jsonresult->choices)) { + $this->logger->info("Starting Processing completions"); $result = $this->convert_chat_completion($jsonresult->choices); - if (isset($jsonresult->usage)) { - $this->provider->increment_prompt_usage($jsonresult->usage->prompt_tokens); - $this->provider->increment_completion_tokens($jsonresult->usage->completion_tokens); - $this->provider->increment_total_tokens($jsonresult->usage->total_tokens); - } + $this->logger->info("Finished completions"); } - + if (isset($jsonresult->usage)) { + $this->logger->info("Updating token usage"); + $usage = $jsonresult->usage; + $updated = [ + $this->provider->increment_prompt_usage($usage->prompt_tokens), + $this->provider->increment_completion_tokens($usage->completion_tokens), + $this->provider->increment_total_tokens($usage->total_tokens) + ]; + $this->logger->info("Request Tokens-{prompt_tokens}. Total tokens: {total_tokens}", (array)$usage); + $this->logger->info("Tokens-Prompt:{$updated[0]}, Completion:{$updated[1]}, Total:{$updated[2]}"); + } + //$this->logger->info($result); return $result; } @@ -79,30 +93,43 @@ public function embed_query($content): array { $usedptokens = $this->provider->get_usage('prompt_tokens'); $totaltokens = $this->provider->get_usage('total_tokens'); // mtrace("Prompt tokens: $usedptokens. Total tokens: $totaltokens"); + $content = $content ?? ""; // Fix "null" content to be "empty" string. $params = [ "input" => htmlentities($content), // TODO need to do some length checking here! - "model" => $this->provider->get('embeddingmodel') + "model" => $this->provider->get('embeddingmodel'), ]; $params = json_encode($params); -// var_dump($this->get_embeddings_url()); + $embeddingsurl = $this->get_embeddings_url(); + $this->logger->info("Embeddings URL: " . $embeddingsurl); + $urlisblocked = $this->check_securityhelper_blocklist($embeddingsurl); + if (!is_null($urlisblocked)) { + $this->logger->warning($urlisblocked); + throw new \moodle_exception("{$embeddingsurl} is blocked by policy"); + } + $rawresult = $this->post($embeddingsurl, $params); - $rawresult = $this->post($this->get_embeddings_url(), $params); -// var_dump($rawresult); $result = json_decode($rawresult, true); - // var_dump($result); - $usage = $result['usage']; - $this->provider->increment_prompt_usage($usage['prompt_tokens']); - $this->provider->increment_total_tokens($usage['total_tokens']); - // mtrace("Used Prompt tokens: {$usage['prompt_tokens']}. Total tokens: {$usage['total_tokens']}"); - $data = $result['data']; - foreach($data as $d) { - if ($d['object'] == "embedding") { - return $d['embedding']; + if (is_null($result)) { + throw new \moodle_exception('Failed to decode response from AI provider: {$a}', "", "", $rawresult); + } + if (isset($result['usage'])) { + $usage = $result['usage']; + $updated = [ + $this->provider->increment_prompt_usage($usage['prompt_tokens']), + $this->provider->increment_total_tokens($usage['total_tokens']) + ]; + $this->logger->info("Used Prompt tokens: {prompt_tokens}. Total tokens: {total_tokens}", $usage); + $this->logger->info("Tokens-Prompt:{$updated[0]}, Total:{$updated[1]}"); + } + if (isset($result['data'])) { + $data = $result['data']; + foreach ($data as $d) { + if ($d['object'] == "embedding") { + return $d['embedding']; + } } } - $usedptokens = $this->provider->get_usage('prompt_tokens'); - $totaltokens = $this->provider->get_usage('total_tokens'); - // mtrace("Total Used: Prompt tokens: $usedptokens. Total tokens: $totaltokens"); + return []; } public function embed_documents(array $documents) { diff --git a/ai/classes/aiprovider.php b/ai/classes/aiprovider.php index 5eb9b28a0e80b..151a6ade687df 100644 --- a/ai/classes/aiprovider.php +++ b/ai/classes/aiprovider.php @@ -5,7 +5,8 @@ use core\persistent; use core_course_category; -class AIProvider extends persistent { +class AIProvider extends persistent implements LoggerAwareInterface { + use LoggerAwareTrait; // Ultimately this would extend a persistent. const CONTEXT_ALL_MY_COURSES = -1; @@ -38,6 +39,10 @@ protected static function define_properties() 'embeddingmodel' => [ 'type' => PARAM_ALPHANUMEXT ], + 'embeddingdimensions' => [ + 'type' => PARAM_INT, + 'default' => 1536 + ], 'completionsurl' => [ 'type' => PARAM_URL ], @@ -57,7 +62,24 @@ protected static function define_properties() ]; } + /** + * @param string $prefix + * @return LoggerInterface + * @throws \coding_exception + */ + public function get_logger(string $prefix = "") { + if (is_null($this->logger)) { + $id = $this->get('id'); + $name = "aiprovider-{$id}"; + if (!empty($prefix)) { + $name = $prefix . "-{$name}"; + } + $name .= ".log"; + $this->setLogger(new logger($name)); + } + return $this->logger; + } /** * Work out the context path from the site to this AI Provider's context * @return void @@ -74,17 +96,18 @@ public function use_for_query():bool { return $this->get('allowquery'); } public function get_usage($type) { - return "-"; + $key = [ '$type', $this->get('id'), $this->get('apikey'), ]; + $key = implode("_", $key); $current = get_config('ai', $key); return $current; } public function increment_prompt_usage($change) { - return; + $key = [ 'prompttokens', $this->get('id'), @@ -93,10 +116,12 @@ public function increment_prompt_usage($change) { $key = implode("_", $key); $current = get_config('ai', $key); $new = $current + $change; + $this->logger->info("Incrementing prompt token usage from {$current} to {$new}"); set_config($key, $new, 'ai'); + return $new; } public function increment_completion_tokens($change) { - return; + $key = [ 'completiontokens', $this->get('id'), @@ -105,10 +130,12 @@ public function increment_completion_tokens($change) { $key = implode("_", $key); $current = get_config('ai', $key); $new = $current + $change; + $this->logger->info("Incrementing completion token usage from {$current} to {$new}"); set_config($key, $new, 'ai'); + return $new; } public function increment_total_tokens($change) { - return; + $key = [ 'totaltokens', $this->get('id'), @@ -117,7 +144,9 @@ public function increment_total_tokens($change) { $key = implode("_", $key); $current = get_config('ai', $key); $new = $current + $change; + $this->logger->info("Incrementing total token usage from {$current} to {$new}"); set_config($key, $new, 'ai'); + return $new; } /** @@ -191,39 +220,39 @@ public function get_settings_for_user($user) { */ public static function get_records($filters = [], $sort = '', $order = 'ASC', $skip = 0, $limit = 0) { global $_ENV; -// $records = []; -// $fake = new static(0, (object) [ -// 'id' => 1, -// 'name' => "Open AI Provider (hardcoded)", -// 'enabled' => true, -// 'allowembeddings' => true, -// 'allowchat' => true, -// 'baseurl' => 'https://api.openai.com/v1/', -// 'embeddings' => 'embeddings', -// 'embeddingmodel' => 'text-embedding-3-small', -// 'completions' => 'chat/completions', -// 'completionmodel' => 'gpt-4-turbo-preview', -// 'apikey'=> $_ENV['OPENAIKEY'], -// 'contextid' => \context_system::instance()->id, -// //null, // Global AI Provider -// 'onlyenrolledcourses' => true -// ]); -// array_push($records, $fake); -// $fake = new static(0, (object) [ -// 'id' => 2, -// 'name' => "Ollama AI Provider (hard coded)", -// 'enabled' => true, -// 'allowembeddings' => true, -// 'allowchat' => true, -// 'baseurl' => 'http://127.0.0.1:11434/api/', -// 'embeddings' => 'embeddings', -// 'embeddingmodel' => '', -// 'completions' => 'chat', -// 'completionmodel' => 'llama2', -// 'contextid' => null, // Global AI Provider -// 'onlyenrolledcourses' => true -// ]); -// array_push($records, $fake); +/* $records = []; + $fake = new static(0, (object) [ + 'id' => 1, + 'name' => "Open AI Provider (hardcoded)", + 'enabled' => true, + 'allowembeddings' => true, + 'allowchat' => true, + 'baseurl' => 'https://api.openai.com/v1/', + 'embeddings' => 'embeddings', + 'embeddingmodel' => 'text-embedding-3-small', + 'completions' => 'chat/completions', + 'completionmodel' => 'gpt-4-turbo-preview', + 'apikey'=> $_ENV['OPENAIKEY'], + 'contextid' => \context_system::instance()->id, + //null, // Global AI Provider + 'onlyenrolledcourses' => true + ]); + array_push($records, $fake); + $fake = new static(0, (object) [ + 'id' => 2, + 'name' => "Ollama AI Provider (hard coded)", + 'enabled' => true, + 'allowembeddings' => true, + 'allowchat' => true, + 'baseurl' => 'http://127.0.0.1:11434/api/', + 'embeddings' => 'embeddings', + 'embeddingmodel' => '', + 'completions' => 'chat', + 'completionmodel' => 'llama2', + 'contextid' => null, // Global AI Provider + 'onlyenrolledcourses' => true + ]); + array_push($records, $fake);*/ /* $fake = new static(0, (object) [ 'id' => 3, @@ -258,25 +287,26 @@ public static function get_records($filters = [], $sort = '', $order = 'ASC', $s $records = parent::get_records($filters, $sort, $order, $skip, $limit); $records = array_filter($records, function($record) use ($filters, $targetcontext) { $result = true; + $providercontextid = $record->get('contextid'); + // System provider is already listed. + if ($providercontextid == 0) { + return false; + } foreach($filters as $key => $value) { if ($key == "contextid") { - $providercontextid = $record->get('contextid'); if ($providercontextid == self::CONTEXT_ALL_MY_COURSES) { // More problematic. $result = $result & true; } else if ($providercontextid == 0) { - // System provider so always matches. - $result = $result & true; + return false; } else { $providercontext = \context::instance_by_id( $providercontextid ); $ischild = $targetcontext->is_child_of($providercontext, true); -// debugging("IS child ". (int)$ischild, DEBUG_DEVELOPER); $result = $result & $ischild; } }else { -// debugging('Filtering on '.$key. "' = {$value}", DEBUG_DEVELOPER); if ($record->get($key) != $value) { return false; } diff --git a/ai/classes/api.php b/ai/classes/api.php index 9b024dfb8e156..0503582dfe4de 100644 --- a/ai/classes/api.php +++ b/ai/classes/api.php @@ -8,7 +8,6 @@ * AI Help API. */ class api { - const ACTION_ADD_PROVIDER = "add"; const ACTION_REMOVE_PROVIDER = "remove"; const ACTION_EDIT_PROVIDER = "edit"; @@ -24,9 +23,7 @@ public static function get_all_providers($context = null) { return array_values(aiprovider::get_records()); } public static function get_provider(int $id): AIProvider { - $fakes = aiprovider::get_records(); - return $fakes[0]; // Open AI - // return $fakes[1]; // Ollama + return aiprovider::get_record(['id' => $id]); } /** diff --git a/ai/classes/logger.php b/ai/classes/logger.php index 574532ea781ab..01a01ae4dbd49 100644 --- a/ai/classes/logger.php +++ b/ai/classes/logger.php @@ -1,19 +1,71 @@ logpath = $logdir . '/' . $identifier . '.log'; + $this->logpath = $logdir . '/' . $identifier; + if (!defined('ALREADY_LOGGING')) { + define('ALREADY_LOGGING', true); + $this->write(""); + $this->write("Opening Log file"); + } + } + protected function interpolate($message, array $context = []) { + // build a replacement array with braces around the context keys + $replace = array(); + foreach ($context as $key => $val) { + // check that the value can be cast to string + if (!is_array($val) && (!is_object($val) || method_exists($val, '__toString'))) { + $replace['{' . $key . '}'] = $val; + } + } + + // interpolate replacement values into the message and return + return strtr($message, $replace); } - public function write($message) { + protected function write($message) { $ts = microtime(true); + //%d %B %Y, %I:%M %p + $uts = userdate($ts, '%Y-%M-%d %I:%M:%p'); $f = fopen($this->logpath, 'a'); if(flock($f, LOCK_EX | LOCK_NB)) { - fwrite($f, "{$ts} - {$message}\n"); +// debugging("Writing to log file: {$this->logpath}"); + fwrite($f, "{$ts} - {$uts} - {$message}\n"); flock($f, LOCK_UN); } fclose($f); } + public function emergency($message, array $context = []) { + $this->log(LogLevel::EMERGENCY, $message, $context); + } + public function alert($message, array $context = []) { + $this->log(LogLevel::ALERT, $message, $context); + } + public function critical($message, array $context = []) { + $this->log(LogLevel::CRITICAL, $message, $context); + } + + public function error($message, array $context = []) { + $this->log(LogLevel::ERROR, $message, $context); + } + public function warning($message, array $context = []) { + $this->log(LogLevel::WARNING, $message, $context); + } + public function notice($message, array $context = []) { + $this->log(LogLevel::NOTICE, $message, $context); + } + public function info($message, array $context = []) { + $this->log(LogLevel::INFO, $message, $context); + } + public function debug($message, array $context = []) { + $this->log(LogLevel::DEBUG, $message, $context); + } + public function log($level, $message, array $context = []) { + $message = $this->interpolate($message, $context); + $rawmessage = "{$level} - {$message}"; + $this->write($rawmessage); + } } diff --git a/ai/classes/search/testable_manager.php b/ai/classes/search/testable_manager.php new file mode 100644 index 0000000000000..1bf541d6ba559 --- /dev/null +++ b/ai/classes/search/testable_manager.php @@ -0,0 +1,15 @@ +. diff --git a/ai/cli/query.php b/ai/cli/query.php index 61e70eb59f64f..6d84b407b3fe8 100644 --- a/ai/cli/query.php +++ b/ai/cli/query.php @@ -3,9 +3,12 @@ require_once(__DIR__.'/../../config.php'); require_once($CFG->libdir.'/clilib.php'); +// Load the "testable" version so we can get some of the internals back out. +require_once($CFG->dirroot .'/search/tests/fixtures/testable_core_search.php'); use core_ai\api; use core_ai\aiclient; use core_ai\aiprovider; + [$options, $unrecognized] = cli_get_params( [ 'help' => false, @@ -28,7 +31,9 @@ cli_error(get_string('cliunknowoption', 'admin', $unrecognized)); } $help = <<userprompt = $options['prompt']; $formdata->contextids = []; $formdata->mycoursesonly = false; - $formdata->courseids = $options['courses']; + $formdata->courseids = $courseids; $settings = $provider->get_settings_for_user($user); $settings['userquery'] = $formdata->userprompt; - $settings['courseids'] = $options['courses']; - var_dump($settings); - $vector = $client->embed_query($formdata->userprompt); - $settings['vector'] = $vector; + $settings['courseids'] = $courseids; $limitcourseids = $manager->build_limitcourseids($formdata); $limitcontextids = $formdata->contextids; diff --git a/course/moodleform_mod.php b/course/moodleform_mod.php index c0de9974226ef..45bc0e6d97e0f 100644 --- a/course/moodleform_mod.php +++ b/course/moodleform_mod.php @@ -20,6 +20,7 @@ require_once($CFG->libdir.'/gradelib.php'); require_once($CFG->libdir.'/plagiarismlib.php'); +use core_ai\api; use core_grades\component_gradeitems; /** @@ -1234,7 +1235,7 @@ public function standard_aiprovider_coursemodule_elements( $allowembedding ? get_string("yes") : get_string("no") ); - $providers = \core_ai\api::get_providers( + $providers = api::get_providers( $this->context->id, $allowchat, $allowembedding diff --git a/lang/en/ai.php b/lang/en/ai.php index 7b2ff3f6d9c68..1fc89ed9a5aec 100644 --- a/lang/en/ai.php +++ b/lang/en/ai.php @@ -30,6 +30,8 @@ $string['baseurl_help'] = 'Common shared base URL *without* trailing slash'; $string['enabled'] = 'Enabled'; $string['enabled_help'] = 'Show this AI Provider be available to users.'; +$string['extractorpath'] = 'URL to extractor process'; +$string['extractorpath_desc'] = 'URL to extractor process'; $string['disabled'] = 'Disabled'; $string['general'] = 'General Settings'; $string['removeprovider'] = 'Remove AI Provider'; diff --git a/mod/xaichat/mod_form.php b/mod/xaichat/mod_form.php index c4fce3e8e1e94..f9ded9aebe580 100644 --- a/mod/xaichat/mod_form.php +++ b/mod/xaichat/mod_form.php @@ -60,11 +60,9 @@ public function definition() { $mform->addHelpButton('name', 'xaichatname', 'mod_xaichat'); // Adding the standard "intro" and "introformat" fields. - if ($CFG->branch >= 29) { - $this->standard_intro_elements(); - } else { - $this->add_intro_editor(); - } + + $this->standard_intro_elements(); + $this->standard_aiprovider_coursemodule_elements(true, true); // Add standard grading elements. diff --git a/mod/xaichat/view.php b/mod/xaichat/view.php index c91e4ad1572a7..70a5a89137747 100644 --- a/mod/xaichat/view.php +++ b/mod/xaichat/view.php @@ -63,6 +63,7 @@ if (!($aiprovider = api::get_provider($moduleinstance->aiproviderid))){ throw new moodle_exception("noaiproviderfound", 'xaichat'); } +$logger = $aiprovider->get_logger(); $event = \mod_xaichat\event\course_module_viewed::create(array( 'objectid' => $moduleinstance->id, @@ -93,33 +94,46 @@ // a bunch of system and context specific prompts to constrain behaviour. $totalsteps++; $progress->update(1, $totalsteps,'Processing System Prompts'); + $logger->info("Processing System Prompts"); } $progress->update(1, $totalsteps,'Looking for relevant context'); - $vector = $aiclient->embed_query($data->userprompt); + $logger->info("Looking for relevant context"); $search = \core_search\manager::instance(true, true); + // Some of these values can't be "trusted" to the end user to supply, via something // like a form, nor can they be entirely left to the plugin developer. $settings = $aiprovider->get_settings_for_user($USER); - $settings['vector'] = $vector; $settings['userquery'] = $data->userprompt; + // This limits the plugin's search scope. + $settings['courseids'] = [$course->id]; + $docs = $search->search((object)$settings); - + // Perform "R" from RAG, finding documents from within the context that are similar to the user's prompt. // Add the retrieved documents to the context for this chat by generating some system messages with the content // returned - if (!empty($docs)) { + if (empty($docs)) { + $logger->info("No RAG content returned"); + $prompt = (object)[ + "role" => "system", + "content" => "I wasn't able to find anything relevant about this module" + ]; + $_SESSION[$aicontextkey]['messages'][] = $prompt; + } else { +// print_r($docs); $context = []; + // Remember We've got a search_engine doc here! foreach ($docs as $doc) { - $context[] = $doc->content; + $context[] = $doc->get('content'); } $prompt = (object)[ "role" => "system", "content" => "Use the following context to answer following question:" . implode("\n",$context) ]; $_SESSION[$aicontextkey]['messages'][] = $prompt; - } + } $progress->update(2, $totalsteps,'Attaching user prompt'); - // $_SESSION[$aicontextkey]['messages'][] + $prompt = (object)[ "role" => "user", "content" => $data->userprompt @@ -128,15 +142,21 @@ // Pass the whole context over the AI to summarise. $progress->update(3, $totalsteps, 'Waiting for response'); + $logger->info("Waiting for response from {providername}", ["providername" => $aiprovider->get('name')]); $airesults = $aiclient->chat($_SESSION[$aicontextkey]['messages']); $_SESSION[$aicontextkey]['messages'] = array_merge($_SESSION[$aicontextkey]['messages'],$airesults); - $progress->update(4, $totalsteps, 'Got Response'); + //$progress->update(4, $totalsteps, 'Finished talking to AI'); + $progress->update_full(100,'Finished talking to AI'); + $logger->info("Finished talking to {providername}", ["providername" => $aiprovider->get('name')]); + +// $logger->info("{response}", ['response' => print_r($airesults, 1)]); // We stash the data in the session temporarily (should go into an activity-user store in database) but this // is fast and dirty, and then we do a redirect so that we don't double up the request if the user hit's // refresh. - $next = new \moodle_url('/mod/xaichat/view.php', ['id' => $cm->id]); - redirect($next); +// $next = new \moodle_url('/mod/xaichat/view.php', ['id' => $cm->id]); + + //redirect($next); } else if ($chatform->is_cancelled()) { $_SESSION[$aicontextkey] = [ 'messages'=>[] @@ -177,7 +197,7 @@ echo $OUTPUT->render_from_template("mod_xaichat/conversation", $tcontext); -if (false) { +if (true) { echo html_writer::tag("pre", print_r($_SESSION[$aicontextkey]['messages'],1)); } diff --git a/search/engine/solrrag/classes/document.php b/search/engine/solrrag/classes/document.php index ba33197d8f81b..3f6abf84acefc 100644 --- a/search/engine/solrrag/classes/document.php +++ b/search/engine/solrrag/classes/document.php @@ -40,6 +40,11 @@ class document extends \search_solr\document { 'type' => 'knn_vector_3072', // this field def seems to be related to the size of the LLM embedding too :-( 'stored' => true, 'indexed' => true + ], + 'solr_vector_768' => [ + 'type' => 'knn_vector_768', // this field def seems to be related to the size of the LLM embedding too :-( + 'stored' => true, + 'indexed' => true ] ); @@ -80,6 +85,11 @@ public function fetch_document_contents() { } public function set_data_from_engine($docdata) { $fields = static::$requiredfields + static::$optionalfields + static::$enginefields; + $skipfields = [ + 'solr_vector_1536', + 'solr_vector_3072', + 'solr_vector_768' + ]; foreach ($fields as $fieldname => $field) { // Optional params might not be there. @@ -89,8 +99,9 @@ public function set_data_from_engine($docdata) { $this->set($fieldname, static::import_time_from_engine($docdata[$fieldname])); } else { // No way we can make this work if there is any multivalue field. - if($fieldname === 'solr_vector_1536' || $fieldname === 'solr_vector_3072') { - debugging("Skipping $fieldname"); +// if($fieldname === 'solr_vector_1536' || $fieldname === 'solr_vector_3072') { + if (in_array($fieldname, $skipfields)) { +// debugging("Skipping $fieldname"); continue; } if (is_array($docdata[$fieldname])) { diff --git a/search/engine/solrrag/classes/engine.php b/search/engine/solrrag/classes/engine.php index c7c56d9696e28..0f0b0a86b4a4a 100644 --- a/search/engine/solrrag/classes/engine.php +++ b/search/engine/solrrag/classes/engine.php @@ -3,25 +3,23 @@ namespace search_solrrag; use core_ai\api; -use core_ai\logger; +use core_ai\LoggerAwareTrait; use search_solrrag\document; use search_solrrag\schema; -//require_once($CFG->dirroot . "/search/engine/solrrag/lib.php"); -// // Fudge autoloading! -// require_once($CFG->dirroot ."/search/engine/solrrag/classes/ai/api.php"); -// require_once($CFG->dirroot ."/search/engine/solrrag/classes/ai/aiprovider.php"); -// require_once($CFG->dirroot ."/search/engine/solrrag/classes/ai/aiclient.php"); + use \core_ai\AIProvider; use \core_ai\aiclient; use \core_ai\AiException; -class engine extends \search_solr\engine { + +class engine extends \search_solr\engine implements \core_ai\LoggerAwareInterface { + use LoggerAwareTrait; /** * @var AIProvider AI rovider object to use to generate embeddings. */ protected ?AIClient $aiclient = null; protected ?AIProvider $aiprovider = null; - protected ?logger $logger = null; + public function __construct(bool $alternateconfiguration = false) { parent::__construct($alternateconfiguration); @@ -34,6 +32,7 @@ public function __construct(bool $alternateconfiguration = false) $aiprovider = api::get_provider($aiproviderid); $this->aiprovider = $aiprovider; $this->aiclient = !is_null($aiprovider)? new AIClient($aiprovider) : null; + $this->setLogger($aiprovider->get_logger()); } public function is_server_ready() @@ -76,24 +75,28 @@ public function is_server_ready() */ public function add_document($document, $fileindexing = false) { $docdata = $document->export_for_engine(); - debugging("Adding document"); + $this->logger->info("Adding document to search engine"); if ($this->aiprovider->use_for_embeddings() && $this->aiclient) { - debugging('Generating vector using provider'); + $this->logger->info("Generating vector using document content"); $vector = $this->aiclient->embed_query($document['content']); $vlength = count($vector); $vectorfield = "solr_vector_" . $vlength; + $this->logger->info("Generated vector length: {length}, field: {field}", [ + 'length' => $vlength, 'field' => $vectorfield + ]); $docdata[$vectorfield] = $vector; - // var_dump($docdata); } else { - debugging("Err didn't do any vector stuff!"); + $this->logger->warning("Wasn't able to generate a vector for document"); } if (!$this->add_solr_document($docdata)) { + $this->logger->warning("Failed to add document to search engine index"); return false; } if ($fileindexing) { // This will take care of updating all attached files in the index. + $this->logger->warning("Processing document's files"); $this->process_document_files($document); } @@ -101,19 +104,26 @@ public function add_document($document, $fileindexing = false) { } public function add_document_batch(array $documents, bool $fileindexing = false): array { + $this->logger->info("Entering solrrag::add_document_batch()"); $docdatabatch = []; foreach ($documents as $document) { //$docdatabatch[] = $document->export_for_engine(); $doc = $document->export_for_engine(); if ($this->aiprovider->use_for_embeddings() && $this->aiclient) { - debugging('Generating vector using provider'); + if (empty($doc['content'])) { + $this->logger->info("Empty doc {id} - {title}", ['id' => $doc['id'], 'title' => $doc['title']]); + } + $this->logger->info('Generating vector using provider'); $vector = $this->aiclient->embed_query($doc['content']); $vlength = count($vector); $vectorfield = "solr_vector_" . $vlength; $doc[$vectorfield] = $vector; - // var_dump($doc); + $this->logger->info("Vector length {length} field {field}", [ + 'length' => $vlength, 'field' => $vectorfield + ]); } else { - debugging("Err didn't do any vector stuff!"); + $this->logger->info("Didn't do any vector stuff!"); +// debugging("Err didn't do any vector stuff!"); } $docdatabatch[] = $doc; } @@ -121,15 +131,91 @@ public function add_document_batch(array $documents, bool $fileindexing = false) $resultcounts = $this->add_solr_documents($docdatabatch); // Files are processed one document at a time (if there are files it's slow anyway). + if ($fileindexing) { + $this->logger->info("Processing files"); foreach ($documents as $document) { // This will take care of updating all attached files in the index. $this->process_document_files($document); } + $this->logger->info("Completed Processing files"); } return $resultcounts; } + /** + * Adds multiple text documents to the search engine. + * + * @param array $docs Array of documents (each an array of fields) to add + * @return int[] Array of success, failure, batch count + * @throws \core_search\engine_exception + */ + protected function add_solr_documents(array $docs): array { + $solrdocs = []; + foreach ($docs as $doc) { + $solrdocs[] = $this->create_solr_document($doc); + } + + try { + // Add documents in a batch and report that they all succeeded. + $this->get_search_client()->addDocuments($solrdocs, true, static::AUTOCOMMIT_WITHIN); + return [count($solrdocs), 0, 1]; + } catch (\SolrClientException $e) { + // If there is an exception, fall through... + $donothing = true; + } catch (\SolrServerException $e) { + // If there is an exception, fall through... + $donothing = true; + } + + // When there is an error, we fall back to adding them individually so that we can report + // which document(s) failed. Since it overwrites, adding the successful ones multiple + // times won't hurt. + $success = 0; + $failure = 0; + $batches = 0; + foreach ($docs as $doc) { + $result = $this->add_solr_document($doc); + $batches++; + if ($result) { + $success++; + } else { + $failure++; + } + } + + return [$success, $failure, $batches]; + } + + /** + * Adds a text document to the search engine. + * + * @param array $doc + * @return bool + */ + protected function add_solr_document($doc) { + $solrdoc = $this->create_solr_document($doc); + + try { + $result = $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN); + return true; + } catch (\SolrClientException $e) { + debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage(), DEBUG_DEVELOPER); + $this->logger->error('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage()); + } catch (\SolrServerException $e) { + // We only use the first line of the message, as it's a fully java stacktrace behind it. + // $msg = strtok($e->getMessage(), "\n"); + $msg = $e->getMessage(); + debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $msg, DEBUG_DEVELOPER); + $this->logger->error('Solr server error adding document with id ' . $doc['id'] . ': ' . $msg); + $msgdoc = $doc; + unset($msgdoc['solr_vector_768']); + $this->logger->debug(print_r($msgdoc, true)); + + } + + return false; + } /** * Adds a file to the search engine. @@ -142,8 +228,11 @@ public function add_document_batch(array $documents, bool $fileindexing = false) * @param \stored_file $storedfile * @return void */ - protected function add_stored_file($document, $storedfile) - { + protected function add_stored_file($document, $storedfile) { + $this->logger->info("Adding stored file {name} to document {document}", [ + "name" => $storedfile->get_filename(), + "document" => "TBD" + ]); $embeddings = []; $filedoc = $document->export_file_for_engine($storedfile); @@ -151,6 +240,7 @@ protected function add_stored_file($document, $storedfile) if (!$this->file_is_indexable($storedfile)) { // For files that we don't consider indexable, we will still place a reference in the search engine. + $this->logger->warning("File {filename} is not indexable", ['filename' => $storedfile->get_filename()]); $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE; $this->add_solr_document($filedoc); return; @@ -190,20 +280,20 @@ protected function add_stored_file($document, $storedfile) $url->param('resource.name', $storedfile->get_filename()); // If we're not doing embeddings, then we can just use the "original" implementation which will // extract and index the file without passing the content back. - if (!$this->aiprovider->use_for_embeddings()) { - $url->param('extractOnly', "true"); + if ($this->aiprovider->use_for_embeddings()) { + $this->logger->info("Extracting file content without embeddings"); + $url->param('extractOnly', "true"); // This gets solr to extract the content but not write it to the index. } // A giant block of code that is really just error checking around the curl request. try { $requesturl = $url->out(false); - debugging($requesturl); + $this->logger->info("Attempting to extract resource content"); // We have to post the file directly in binary data (not using multipart) to avoid // Solr bug SOLR-15039 which can cause incorrect data when you use multipart upload. // Note this loads the whole file into memory; see limit in file_is_indexable(). $result = $curl->post($requesturl, $storedfile->get_content()); - //$url->out(false) $code = $curl->get_errno(); $info = $curl->get_info(); @@ -213,6 +303,7 @@ protected function add_stored_file($document, $storedfile) // This means an internal cURL error occurred error is in result. $message = 'Curl error ' . $code . ' while indexing file with document id ' . $filedoc['id'] . ': ' . $result . '.'; debugging($message, DEBUG_DEVELOPER); + $this->logger->error($message); } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) { // Unexpected HTTP response code. $message = 'Error while indexing file with document id ' . $filedoc['id']; @@ -225,8 +316,9 @@ protected function add_stored_file($document, $storedfile) // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter. if (CLI_SCRIPT && !PHPUNIT_TEST) { mtrace($message); + $this->logger->warning($message); if (debugging()) { - mtrace($requesturl); + $this->logger->debug($requesturl); } // Suspiciion that this fails due to the file contents being PDFs. } @@ -237,44 +329,72 @@ protected function add_stored_file($document, $storedfile) if ((int)$matches[1] !== 0) { $message = 'Unexpected Solr status code ' . (int)$matches[1]; $message .= ' while indexing file with document id ' . $filedoc['id'] . '.'; - debugging($message, DEBUG_DEVELOPER); + $this->logger->warning($message); } else { - // The document was successfully indexed. + // The document was successfully extracted. if ($this->aiprovider->use_for_embeddings() && $this->aiclient) { - preg_match('/(?.*)<\/str>/imsU', $result, $streamcontent); - debugging("Got SOLR update/extract response"); - if ($streamcontent[1]!== 0) { - $xmlcontent = html_entity_decode($streamcontent[1]); - $xml = simplexml_load_string($xmlcontent); - $filedoc['content'] = (string)$xml->body->asXML(); - $metadata = $xml->head->meta; - foreach($metadata as $meta) { - $name = (string)$meta['name']; - $content = (string)$meta['content']; - if ($content != null) { - $filedoc[$name] = $content; + $matchresult = preg_match('/(?.*)<\/str>/imsU', $result, $streamcontent); + if ($matchresult === 0) { + $this->logger->error("Didn't get an extraction response"); + $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR; + $this->logger->debug($requesturl); + $this->logger->debug($result); + + } else { + $this->logger->info('document extracted successfully'); + $xmlcontent = html_entity_decode($streamcontent[1]); + $this->logger->debug($xmlcontent); + try { + $xml = simplexml_load_string($xmlcontent); + if ($xml === false) { + $this->logger->error("Didn't get back a valid XML response"); + $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR; } else { - $filedoc[$name] = ""; - + $filedoc['content'] = (string)$xml->body->asXML(); + $metadata = $xml->head->meta; + foreach ($metadata as $meta) { + $name = (string)$meta['name']; + $content = (string)$meta['content']; + if ($content != null) { + $filedoc[$name] = $content; + } else { + $filedoc[$name] = ""; + + } + } + // Note a successful extraction in the log + $this->logger->info("Successfully extracted content from file {filename}", [ + 'filename' => $storedfile->get_filename() + ]); + } + /** + * Since solr has given us back the content, we can now send it off to the AI provider. + */ + // garnish $filedoc with the embedding vector. It would be nice if this could be done + // via the export_file_for_engine() call above, that has no awareness of the engine. + // We expect $filedoc['content'] to be set. + if(isset($filedoc['content'])) { + $this->logger->info("Extracting vector from content {$content}", $filedoc); + $vector = $this->aiclient->embed_query($filedoc['content']); + $vlength = count($vector); + $vectorfield = "solr_vector_" . $vlength; + $filedoc[$vectorfield] = $vector; + $this->logger->info("Generated vector length: {length}, field: {field}", [ + 'length' => $vlength, 'field' => $vectorfield + ]); + } else { + $this->logger->info("Document had no content", $filedoc); } + $this->logger->info("Solr dor: {doc}", ["doc" => print_r($filedoc,true)]); + } catch (\Exception $e) { + $this->logger->error("Error parsing XML from solr"); + $this->logger->debug($xmlcontent); } + } - /** - * Since solr has given us back the content, we can now send it off to the AI provider. - */ - - // garnish $filedoc with the embedding vector. It would be nice if this could be done - // via the export_file_for_engine() call above, that has no awareness of the engine. - // We expect $filedoc['content'] to be set. - $vector = $this->aiclient->embed_query($filedoc['content']); - $vlength = count($vector); - $vectorfield = "solr_vector_" . $vlength; - $filedoc[$vectorfield] = $vector; - } else { - // As before if embeddings is not in use, then we can bail - // as the document is already indexed. - return; } + // We can add either the document with content or without. + $this->logger->info("Adding document to search index."); $this->add_solr_document($filedoc); return; } @@ -283,11 +403,13 @@ protected function add_stored_file($document, $storedfile) $message = 'Unexpected Solr response while indexing file with document id ' . $filedoc['id'] . ': '; $message .= strtok($result, "\n"); debugging($message, DEBUG_DEVELOPER); + $this->logger->warning($message); } } } catch (\Exception $e) { // There was an error, but we are not tracking per-file success, so we just continue on. debugging('Unknown exception while indexing file "' . $storedfile->get_filename() . '".', DEBUG_DEVELOPER); + $this->logger->error($message); } // If we get here, the document was not indexed due to an error. So we will index just the base info without the file. @@ -305,6 +427,7 @@ protected function add_stored_file($document, $storedfile) protected function create_solr_document(array $doc): \SolrInputDocument { $solrdoc = new \SolrInputDocument(); + $forcetostring = ["dc_title", "Object_Name"]; // Replace underlines in the content with spaces. The reason for this is that for italic // text, content_to_text puts _italic_ underlines. Solr treats underlines as part of the // word, which means that if you search for a word in italic then you can't find it. @@ -318,11 +441,23 @@ protected function create_solr_document(array $doc): \SolrInputDocument { continue; } if (is_array($value)) { + $i = 0; foreach ($value as $v) { + if (empty($v)) { + $this->logger->debug("Field {name} pos {i} is empty", ["name" => $field, "i" => $i]); + } $solrdoc->addField($field, $v); + $i++; } continue; } + if (empty($value)) { + $this->logger->debug("Field {name} is empty", ["name" => $field]); + } + if (in_array($field, $forcetostring)) { + $this->logger->debug("Forcing {name} to string", ["name" => $field]); + $value = "{$value}"; + } $solrdoc->addField($field, $value); } @@ -337,46 +472,27 @@ protected function create_solr_document(array $doc): \SolrInputDocument { * @throws \core_search\engine_exception */ public function execute_query($filters, $accessinfo, $limit = 0) { - + $this->logger->info("Entering execute_query"); if (isset($filters->similarity) && $filters->similarity ) { // Do a vector similarity search. - // debugging("Running similarity search", DEBUG_DEVELOPER); + $this->logger->info("Running similarity search"); + $this->logger->info("Fetching Vector for {userquery}", (array)$filters); + $vector = $this->aiclient->embed_query($filters->userquery); + $filters->vector = $vector; // We may get accessinfo, but we actually should determine our own ones to apply too // But we can't access the "manager" class' get_areas_user_accesses function, and // that's already been called based on the configuration / data from the user $docs = $this->execute_similarity_query($filters, $accessinfo, $limit); - var_dump($docs); + // Really should run a process similar to the process_response() function. + return $docs; } else { - // debugging("Running regular search", DEBUG_DEVELOPER); - // print_r($filters); - // print_r($accessinfo); + $this->logger->info("Executing regular search"); return parent::execute_query($filters, $accessinfo, $limit); } } - /** - * A logging function just to allow us to output all the things - * that the process is doing for verification / validation. - * - * Probably not the most efficient way to do this, but Moodle's lacking - * a good generic logging framework. - * - * @param mixed $message An object/array/string that will be turned into a string. - */ - protected function log($message) { - $logfiledir = make_temp_directory('search_solrrag'); - $file = $logfiledir . '/solr_knn_query.log'; - $log = fopen($file, 'a'); - if (is_object($message)) { - $message = print_r($message, true); - } else if (is_array($message)) { - $message = print_r($message, true); - } - fwrite($log, date('Y-m-d H:i:s') . " " . $message . "\n"); - fclose($log); - } /** * Perform a similarity search against the backend. @@ -393,9 +509,7 @@ protected function log($message) { */ public function execute_similarity_query(\stdClass $filters, \stdClass $accessinfo, int $limit = null) { $data = clone($filters); - $this->log("Executing SOLR KNN QUery"); - $this->log("Filters"); - $this->log($filters); + $this->logger->info("Executing SOLR KNN QUery"); $vector = $filters->vector; $topK = $limit > 0 ? $limit: 1; // We'll make the number of neighbours the same as search result limit. @@ -406,17 +520,22 @@ public function execute_similarity_query(\stdClass $filters, \stdClass $accessin $field = "solr_vector_" . count($vector); $requestbody = "{!knn f={$field} topK={$topK}}[" . implode(",", $vector) . "]"; - $this->log($requestbody); + $filters->mainquery = $requestbody; // Build filter restrictions. $filterqueries = []; if(!empty($data->areaids)) { - $filterqueries[] = '{!cache=false}areaid:(' . implode(' OR ', $data->areaids) . ')'; + $r = '{!cache=false}areaid:(' . implode(' OR ', $data->areaids) . ')'; + $this->logger->info("Attaching areid restriction: {areaid}", ['areaid' => $r]); + $filterqueries[] = $r; } - + $r = null; if(!empty($data->excludeareaids)) { - $filterqueries[] = '{!cache=false}-areaid:(' . implode(' OR ', $data->excludeareaids) . ')'; + $r ='{!cache=false}-areaid:(' . implode(' OR ', $data->excludeareaids) . ')'; + $this->logger->info("Attaching areid restriction: {areaid}", ['areaid' => $r]); + $filterqueries[] = $r; } + $r = null; // Build access restrictions. // And finally restrict it to the context where the user can access, we want this one cached. @@ -436,12 +555,15 @@ public function execute_similarity_query(\stdClass $filters, \stdClass $accessin } } if (empty($allcontexts)) { + $this->logger->warning("User has no contexts at all"); // This means there are no valid contexts for them, so they get no results. return null; } - $filterqueries[] = 'contextid:(' . implode(' OR ', $allcontexts) . ')'; + $contexts ='contextid:(' . implode(' OR ', $allcontexts) . ')'; + $this->logger->info("Attaching context restriction: {contexts}", ['contexts' => $contexts]); + $filterqueries[] = $contexts; } - + $r = null; if (!$accessinfo->everything && $accessinfo->separategroupscontexts) { // Add another restriction to handle group ids. If there are any contexts using separate // groups, then results in that context will not show unless you belong to the group. @@ -462,48 +584,62 @@ public function execute_similarity_query(\stdClass $filters, \stdClass $accessin if ($accessinfo->usergroups) { // Either the document has no groupid, or the groupid is one that the user // belongs to, or the context is not one of the separate groups contexts. - $filterqueries[] = '(*:* -groupid:[* TO *]) OR ' . + $r = '(*:* -groupid:[* TO *]) OR ' . 'groupid:(' . implode(' OR ', $accessinfo->usergroups) . ') OR ' . '(*:* -contextid:(' . implode(' OR ', $accessinfo->separategroupscontexts) . '))' . $exceptions; + $this->logger->info("attaching usergroup restriction: {usergroups}", ['usergroups' => $r]); + $filterqueries[] = $r; } else { // Either the document has no groupid, or the context is not a restricted one. - $filterqueries[] = '(*:* -groupid:[* TO *]) OR ' . + $r = '(*:* -groupid:[* TO *]) OR ' . '(*:* -contextid:(' . implode(' OR ', $accessinfo->separategroupscontexts) . '))' . $exceptions; + $this->logger->info("attaching usergroup restriction: {usergroups}", ['usergroups' => $r]); + $filterqueries[] = $r; } } + $params = [ + "query" => $requestbody, + ]; + // Query String parameters. + $qsparams = []; + if ($this->file_indexing_enabled()) { // Now group records by solr_filegroupingid. Limit to 3 results per group. - // TODO work out how to convert the following into query / filter parameters. - // $query->setGroup(true); - // $query->setGroupLimit(3); - // $query->setGroupNGroups(true); - // $query->addGroupField('solr_filegroupingid'); + // TODO work out how to convert the following into query / filter parameters.# + $this->logger->info("Setting SOLR group parameters"); + $qsparams['group'] = "true"; + $qsparams['group.limit'] = 3; + $qsparams['group.ngroups'] = "true"; + $qsparams['group.field'] = 'solr_filegroupingid'; } else { // Make sure we only get text files, in case the index has pre-existing files. $filterqueries[] = 'type:'.\core_search\manager::TYPE_TEXT; } - // Finally perform the actaul search + // Finally perform the actual search. $curl = $this->get_curl_object(); $requesturl = $this->get_connection_url('/select'); - $requesturl->param('fl', 'id,areaid,score,content'); +// $requesturl->param('fl', 'id,areaid,score,content, title'); + // Title is added on the end so we didn't have to recode some indexes below. $requesturl->param('wt', 'xml'); + foreach($qsparams as $qs => $value) { + $requesturl->param($qs, $value); + } foreach($filterqueries as $fq) { $requesturl->param('fq', $fq); } -// $requesturl->param('fq', implode("&", $filterqueries)); - - $params = [ - "query" => $requestbody, - ]; $curl->setHeader('Content-type: application/json'); + $this->logger->info("Solr request: ".$requesturl->out(false)); + $logparams =$params; + unset($logparams['query']); // unset query as it's got the full vector in it. + $this->logger->info("Solr request params: ". json_encode($logparams)); $result = $curl->post($requesturl->out(false), json_encode($params)); - $this->log($result); + $this->logger->info("Got SOLR result"); // Probably have to duplicate error handling code from the add_stored_file() function. $code = $curl->get_errno(); @@ -514,6 +650,7 @@ public function execute_similarity_query(\stdClass $filters, \stdClass $accessin $message = 'Curl error ' . $code . ' retrieving'; // . $filedoc['id'] . ': ' . $result . '.'; debugging($message, DEBUG_DEVELOPER); + $this->logger->error($message); } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) { // Unexpected HTTP response code. $message = 'Error while querying for documents ' ; @@ -526,8 +663,10 @@ public function execute_similarity_query(\stdClass $filters, \stdClass $accessin // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter. if (CLI_SCRIPT && !PHPUNIT_TEST) { mtrace($message); + $this->logger->warning($message); if (debugging()) { mtrace($requesturl); + $this->logger->info($requesturl); } // Suspiciion that this fails due to the file contents being PDFs. } @@ -538,34 +677,64 @@ public function execute_similarity_query(\stdClass $filters, \stdClass $accessin // Now check for the expected status of 0, if not, error. if ((int)$matches[1] !== 0) { $message = 'Unexpected Solr status code ' . (int)$matches[1]; - debugging($message, DEBUG_DEVELOPER); + $this->logger->warning($message); } else { + $this->logger->info("Parsing solr result"); // We got a result back. // echo htmlentities($result); // debugging("Got SOLR update/extract response"); $xml = simplexml_load_string($result); - // echo "
";
-                    // var_dump($xml->result);
-                    // echo "
"; - $results = $xml->result->doc; + if ($this->file_indexing_enabled()) { + $this->logger->info("File indexing enabled"); + // We'll just grab all of the elements that were found. + $results = $xml->xpath("//doc"); +// $this->logger->debug(print_r($results, true)); + } else { + $results = $xml->result->doc; +// $this->logger->debug($result); + } $docs = []; - foreach($results as $doc) { - $docs[] = (object)[ - 'id' => (string)$doc->str[0], - 'areaid' => (string)$doc->str[1], - 'content' => (string)$doc->str[2], - 'score' => (string)$doc->float, - ]; + $titles = []; + if (!empty($results)) { +// echo "
";
+                        foreach ($results as $result) {
+                            $result->rewind();
+                            $doc = [];
+                            while($result->valid()) {
+                                $element = $result->current();
+                                $name = (string)$element["name"];
+                                $doc[$name] = trim((string)$element);
+                                $result->next();
+                            }
+                            $this->logger->debug("Outputting similarity search results");
+                            $this->logger->debug(print_r($doc, true));
+                            $searcharea = $this->get_search_area($doc['areaid']);
+                            $titles[] = $doc['title'];
+                            $doc = $this->to_document($searcharea, $doc);
+
+                            // we're now a "Document" object, so check for content.
+                            if ($doc->is_set('content')) {
+                                $docs[] = $doc;
+                            } else {
+                                $this->logger->info("Document {title} had no content in the end", ['title' => $doc->get('title')]);
+                            }
+                        }
+//                        echo "
"; + // Just for audit/debugging we output the list of resource titles. + $this->logger->info("Document titles: {titles}", ['titles'=> implode(",", $titles)]); + } else { + $this->logger->info("No results found"); } + return $docs; - // [0][1][2] as defined in the fl attribute above - + } } else { // We received an unprocessable response. $message = 'Unexpected Solr response'; $message .= strtok($result, "\n"); debugging($message, DEBUG_DEVELOPER); + $this->logger->warning($message); } } return []; @@ -606,4 +775,86 @@ protected function update_schema($oldversion, $newversion) { return true; } + /** + * Index files attached to the docuemnt, ensuring the index matches the current document files. + * + * For documents that aren't known to be new, we check the index for existing files. + * - New files we will add. + * - Existing and unchanged files we will skip. + * - File that are in the index but not on the document will be deleted from the index. + * - Files that have changed will be re-indexed. + * + * @param \search_solr\document $document + */ + protected function process_document_files($document) { + if (!$this->file_indexing_enabled()) { + return; + } + + // Maximum rows to process at a time. + $rows = 500; + + // Get the attached files. + $files = $document->get_files(); + + // If this isn't a new document, we need to check the exiting indexed files. + if (!$document->get_is_new()) { + // We do this progressively, so we can handle lots of files cleanly. + list($numfound, $indexedfiles) = $this->get_indexed_files($document, 0, $rows); + $count = 0; + $idstodelete = array(); + + do { + // Go through each indexed file. We want to not index any stored and unchanged ones, delete any missing ones. + foreach ($indexedfiles as $indexedfile) { + $fileid = $indexedfile->solr_fileid; + + if (isset($files[$fileid])) { + // Check for changes that would mean we need to re-index the file. If so, just leave in $files. + // Filelib does not guarantee time modified is updated, so we will check important values. + if ($indexedfile->modified != $files[$fileid]->get_timemodified()) { + continue; + } + if (strcmp($indexedfile->title, $files[$fileid]->get_filename()) !== 0) { + continue; + } + if ($indexedfile->solr_filecontenthash != $files[$fileid]->get_contenthash()) { + continue; + } + if ($indexedfile->solr_fileindexstatus == document::INDEXED_FILE_FALSE && + $this->file_is_indexable($files[$fileid])) { + // This means that the last time we indexed this file, filtering blocked it. + // Current settings say it is indexable, so we will allow it to be indexed. + continue; + } + + // If the file is already indexed, we can just remove it from the files array and skip it. + unset($files[$fileid]); + } else { + // This means we have found a file that is no longer attached, so we need to delete from the index. + // We do it later, since this is progressive, and it could reorder results. + $idstodelete[] = $indexedfile->id; + } + } + $count += $rows; + + if ($count < $numfound) { + // If we haven't hit the total count yet, fetch the next batch. + list($numfound, $indexedfiles) = $this->get_indexed_files($document, $count, $rows); + } + + } while ($count < $numfound); + + // Delete files that are no longer attached. + foreach ($idstodelete as $id) { + // We directly delete the item using the client, as the engine delete_by_id won't work on file docs. + $this->get_search_client()->deleteById($id); + } + } + + // Now we can actually index all the remaining files. + foreach ($files as $file) { + $this->add_stored_file($document, $file); + } + } } diff --git a/search/engine/solrrag/settings.php b/search/engine/solrrag/settings.php index a17b7227b24b5..fb7dfacdb90f4 100644 --- a/search/engine/solrrag/settings.php +++ b/search/engine/solrrag/settings.php @@ -53,6 +53,21 @@ "", $optproviders )); + $optextractors = [ + "solrtika" => "Solr with internal Tika", + 'tika' => "Standalone Tika" + ]; + $settings->add(new admin_setting_configselect( + 'search_solrrag/extractor', + 'Choose File Content extractor', + 'List of File Content Extractors', + "", + $optextractors + )); + $settings->add(new admin_setting_configtext('search_solrrag/extractorurl', + new lang_string('extractorpath', 'search_solrrag'), + new lang_string('extractorpath_desc', 'search_solrrag'), '', PARAM_RAW)); + $settings->add(new admin_setting_heading('search_solrrag_connection', new lang_string('connectionsettings', 'search_solrrag'), '')); $settings->add(new admin_setting_configtext('search_solrrag/server_hostname', new lang_string('solrserverhostname', 'search_solrrag'), new lang_string('solrserverhostname_desc', 'search_solrrag'), '127.0.0.1', PARAM_HOST));