Commit

solrrag stashing code

mhughes2k committed Mar 6, 2024
1 parent 28f46b3 commit b24cbd1
Showing 5 changed files with 237 additions and 74 deletions.
3 changes: 2 additions & 1 deletion search/engine/solr/classes/document.php
@@ -187,6 +187,7 @@ protected function apply_defaults() {
* @return array
*/
public function export_file_for_engine($file) {
debugging('Using solrrag\document');
$data = $this->export_for_engine();

// Content is indexed in the main document.
@@ -202,7 +203,7 @@ public function export_file_for_engine($file) {
$data['solr_fileindexstatus'] = self::INDEXED_FILE_TRUE;
$data['title'] = $file->get_filename();
$data['modified'] = self::format_time_for_engine($file->get_timemodified());

// Placeholder value while prototyping the solr_vector field.
$data['solr_vector'] = 'Hello';
return $data;
}
}
51 changes: 50 additions & 1 deletion search/engine/solrrag/classes/ai/aiprovider.php
@@ -2,19 +2,67 @@
// We're mocking a core Moodle "AI" subsystem, à la OAuth 2.

namespace core\ai;

use \core\persistent;

class AIProvider extends persistent {
// Mock implementation; ultimately this would be backed by a real database table.
public function __construct($id = 0, \stdClass $record = null) {
// We deliberately skip the parent constructor: there is no real table behind
// this mock yet, so we avoid the database read and just fake the values.
if ($id > 0) {
$this->raw_set('id', $id);
$this->raw_set('name', "Fake AI Provider");
$this->raw_set('allowembeddings', true);
$this->raw_set('allowquery', true);
} else if ($record !== null) {
// Allow construction from a fake record as well (see get_records() below).
foreach ((array) $record as $property => $value) {
$this->raw_set($property, $value);
}
}
}

protected static function define_properties() {
return [
'name' => [
'type' => PARAM_TEXT
],
'allowembeddings' => [
'type' => PARAM_BOOL
],
'allowquery' => [
'type' => PARAM_BOOL
]
];
}

public function use_for_embeddings(): bool {
return $this->get('allowembeddings');
}

public function use_for_query(): bool {
return $this->get('allowquery');
}

/**
* Embeds each of the given documents.
*
* @param array $documents
* @return array An array of embedding vectors, one per document.
*/
public function embed_documents(array $documents) {
// Send the documents off to a back end, then return an array of each document's vectors.
print_r($documents);
return [
[0.0053587136790156364,
-0.0004999046213924885,
0.038883671164512634,
-0.003001077566295862,
-0.00900818221271038]
];
}

/**
* Embeds a single query/document.
*
* @param string $document The text to embed.
* @return array The embedding vector.
*/
public function embed_query($document): array {
print_r($document);
// Send the document to a back end and return the vector.
return [0.0053587136790156364,
-0.0004999046213924885,
0.038883671164512634,
-0.003001077566295862,
-0.00900818221271038
];
}
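
// A minimal sketch of what a non-mock embed_query() might do, using Moodle's
// curl wrapper against an OpenAI-style embeddings endpoint. The endpoint URL,
// model name, credential property and response shape are all assumptions here,
// not part of this commit:
//
// private function fetch_embedding(string $text): array {
//     $curl = new \curl();
//     $curl->setHeader([
//         'Content-Type: application/json',
//         'Authorization: Bearer ' . $this->get('apikey'), // Hypothetical credential property.
//     ]);
//     $response = $curl->post('https://api.openai.com/v1/embeddings', json_encode([
//         'model' => 'text-embedding-3-small',
//         'input' => $text,
//     ]));
//     $decoded = json_decode($response, true);
//     return $decoded['data'][0]['embedding'] ?? [];
// }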
/**
* We're overriding this whilst we don't have a real DB table.
* @param $filters
@@ -27,6 +75,7 @@ protected static function define_properties()
public static function get_records($filters = array(), $sort = '', $order = 'ASC', $skip = 0, $limit = 0) {
$records = [];
$fake = new static(0, (object) [
'id' => 1,
'name' => "Fake AI Provider",
// Flags required by use_for_embeddings() / use_for_query().
'allowembeddings' => true,
'allowquery' => true
]);
array_push($records, $fake);
3 changes: 3 additions & 0 deletions search/engine/solrrag/classes/ai/api.php
@@ -8,4 +8,7 @@ class api {
public static function get_all_providers() {
return array_values(AIProvider::get_records());
}

public static function get_provider(int $id): AIProvider {
return new AIProvider($id);
}
}
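
For context, the engine in this commit consumes this mock API like so (a condensed restatement of the constructor and add_stored_file() changes further below, not additional API surface):

    $provider = \core\ai\api::get_provider(1);
    if ($provider->use_for_embeddings()) {
        $embeddings = $provider->embed_documents([$filedoc]);
        $filedoc['solr_vector'] = "[" . implode(",", $embeddings[0]) . "]";
    }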
36 changes: 36 additions & 0 deletions search/engine/solrrag/classes/document.php
@@ -38,4 +38,40 @@ class document extends \search_solr\document {

]
);

/**
* Export the data for the given file in relation to this document.
*
* @param \stored_file $file The stored file we are talking about.
* @return array
*/
public function export_file_for_engine($file) {
$data = $this->export_for_engine();

// Content is indexed in the main document.
unset($data['content']);
unset($data['description1']);
unset($data['description2']);

// Going to append the fileid to give it a unique id.
$data['id'] = $data['id'].'-solrfile'.$file->get_id();
$data['type'] = \core_search\manager::TYPE_FILE;
$data['solr_fileid'] = $file->get_id();
$data['solr_filecontenthash'] = $file->get_contenthash();
$data['solr_fileindexstatus'] = self::INDEXED_FILE_TRUE;
$data['solr_vector'] = null;
$data['title'] = $file->get_filename();
$data['modified'] = self::format_time_for_engine($file->get_timemodified());

return $data;
}

/**
* Returns the "content" of the document for embedding.
* This may use some sort of external system. Not yet implemented.
* @return void
*/
public function fetch_document_contents() {

}
}
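
One plausible shape for the fetch_document_contents() stub above, assuming the embedding input is simply the exported text fields (hypothetical; the commit leaves the body empty and the return type undecided):

    public function fetch_document_contents() {
        $data = $this->export_for_engine();
        // Concatenate the text-bearing fields as the embedding input.
        return implode("\n", array_filter([
            $data['title'] ?? '',
            $data['content'] ?? '',
            $data['description1'] ?? '',
            $data['description2'] ?? '',
        ]));
    }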
218 changes: 146 additions & 72 deletions search/engine/solrrag/classes/engine.php
@@ -4,16 +4,34 @@

use search_solrrag\document;
use search_solrrag\schema;

// Fudge autoloading! $CFG must be pulled into scope because this file is loaded by the autoloader.
global $CFG;
require_once($CFG->dirroot . "/search/engine/solrrag/classes/ai/api.php");
require_once($CFG->dirroot . "/search/engine/solrrag/classes/ai/aiprovider.php");
use \core\ai\AIProvider;
class engine extends \search_solr\engine {

/**
* @var AIProvider|null AI provider object used to generate embeddings.
*/
protected ?AIProvider $embeddingprovider = null;

public function __construct(bool $alternateconfiguration = false) {
parent::__construct($alternateconfiguration);

// AI retrieval support: set up an AI provider if one is available.
// Ideally we'd be using a Moodle AI provider to tell us which LLM to use for generating embeddings, and
// then simply calling the API and getting some results back... but we don't have that yet.
// So we'll fudge this for the moment and leverage an OpenAI Web Service API via a simple HTTP request.
$aiproviderid = 1;
$aiprovider = \core\ai\api::get_provider($aiproviderid);
if ($aiprovider->use_for_embeddings()) {
$this->embeddingprovider = $aiprovider;
}
}

public function is_server_ready() {
$configured = $this->is_server_configured();
if ($configured !== true) {
@@ -44,84 +62,140 @@ public function is_server_ready() {
}

/**
* Adds a file to the search engine.
*
* Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
* Tika has much better content type detection than Moodle, and we will have many more doc failures
* if we try to send mime types.
*
* @param \search_solr\document $document
* @param \stored_file $storedfile
* @return void
* @throws \core_search\engine_exception
*/
protected function add_stored_file($document, $storedfile) {
$filedoc = $document->export_file_for_engine($storedfile);

// Should we even attempt to get vectors?
if (!is_null($this->embeddingprovider)) {
// Garnish $filedoc with the embedding vector. It would be nice if this could be done
// via the export_file_for_engine() call above, but that has no awareness of the engine.
$embeddings = $this->embeddingprovider->embed_documents([$filedoc]);
$filedoc['solr_vector'] = "[" . implode(",", $embeddings[0]) . "]";
print_r($filedoc);
} else {
// Potentially warn that the selected provider can't be used for
// generating embeddings for RAG.
}


// The remainder follows the underlying \search_solr\engine add_stored_file() implementation.

if (!$this->file_is_indexable($storedfile)) {
// For files that we don't consider indexable, we will still place a reference in the search engine.
$filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE;
$this->add_solr_document($filedoc);
return;
}
$curl = $this->get_curl_object();

$url = $this->get_connection_url('/update/extract');

// Return results as XML.
$url->param('wt', 'xml');

// This will prevent solr from automatically making fields for every tika output.
$url->param('uprefix', 'ignored_');

// Control how content is captured. This will keep our file content clean of non-important metadata.
$url->param('captureAttr', 'true');
// Move the content to a field for indexing.
$url->param('fmap.content', 'solr_filecontent');

// These are common fields that match the standard *_point dynamic field and cause an error.
$url->param('fmap.media_white_point', 'ignored_mwp');
$url->param('fmap.media_black_point', 'ignored_mbp');

// Copy each key to the url with literal.
// We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names.
foreach ($filedoc as $key => $value) {
// This will take any fields from tika that match our schema and discard them, so they don't overwrite ours.
$url->param('fmap.' . $key, 'ignored_' . $key);
// Place data in a tmp field.
$url->param('literal.mdltmp_' . $key, $value);
// Then move to the final field.
$url->param('fmap.mdltmp_' . $key, $key);
}

// This sets the true filename for Tika.
$url->param('resource.name', $storedfile->get_filename());

// A giant block of code that is really just error checking around the curl request.
try {
$requesturl = $url->out(false);
// We have to post the file directly in binary data (not using multipart) to avoid
// Solr bug SOLR-15039 which can cause incorrect data when you use multipart upload.
// Note this loads the whole file into memory; see limit in file_is_indexable().
$result = $curl->post($url->out(false), $storedfile->get_content());

$code = $curl->get_errno();
$info = $curl->get_info();

// Now error handling. It is just informational, since we aren't tracking per file/doc results.
if ($code != 0) {
// This means an internal cURL error occurred error is in result.
$message = 'Curl error ' . $code . ' while indexing file with document id ' . $filedoc['id'] . ': ' . $result . '.';
debugging($message, DEBUG_DEVELOPER);
} else if (isset($info['http_code']) && ($info['http_code'] !== 200)) {
// Unexpected HTTP response code.
$message = 'Error while indexing file with document id ' . $filedoc['id'];
// Try to get error message out of msg or title if it exists.
if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) {
$message .= ': ' . $matches[1];
} else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) {
$message .= ': ' . $matches[1];
}
// This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter.
if (CLI_SCRIPT && !PHPUNIT_TEST) {
mtrace($message);
if (debugging()) {
mtrace($requesturl);
}
// Suspicion that this fails due to the file contents being PDFs.
}
} else {
// Check for the expected status field.
if (preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $result, $matches)) {
// Now check for the expected status of 0, if not, error.
if ((int)$matches[1] !== 0) {
$message = 'Unexpected Solr status code ' . (int)$matches[1];
$message .= ' while indexing file with document id ' . $filedoc['id'] . '.';
debugging($message, DEBUG_DEVELOPER);
} else {
// The document was successfully indexed.
return;
}
} else {
// We received an unprocessable response.
$message = 'Unexpected Solr response while indexing file with document id ' . $filedoc['id'] . ': ';
$message .= strtok($result, "\n");
debugging($message, DEBUG_DEVELOPER);
}
}
} catch (\Exception $e) {
// There was an error, but we are not tracking per-file success, so we just continue on.
debugging('Unknown exception while indexing file "' . $storedfile->get_filename() . '".', DEBUG_DEVELOPER);
}

// If we get here, the document was not indexed due to an error, so we will index just the base info without the file.
$filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR;
$this->add_solr_document($filedoc);

// It would have been nice to use the underlying solr code, but it's too tightly integrated
// with talking to solr.
// return parent::add_stored_file($document, $storedfile);
}
}
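
On the retrieval side (not part of this commit), the stored solr_vector field would presumably be searched with Solr 9's dense-vector KNN query parser. A hedged sketch of what that might look like from the engine, assuming solr_vector is declared as a DenseVectorField in the Solr schema:

    // Hypothetical: find the top 10 documents nearest to an embedded query.
    $queryvector = $this->embeddingprovider->embed_query($querytext);
    $url = $this->get_connection_url('/select');
    $url->param('q', '{!knn f=solr_vector topK=10}[' . implode(',', $queryvector) . ']');
    $url->param('fl', 'id,title,score');
    $result = $this->get_curl_object()->get($url->out(false));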
