Commit

solrrag stashing code

mhughes2k committed Mar 6, 2024
1 parent 28f46b3 commit b24cbd1
Showing 5 changed files with 237 additions and 74 deletions.
3 changes: 2 additions & 1 deletion search/engine/solr/classes/document.php
@@ -187,6 +187,7 @@ protected function apply_defaults() {
* @return array
*/
public function export_file_for_engine($file) {
debugging('Using solrrag\document');
$data = $this->export_for_engine();

// Content is indexed in the main document.
@@ -202,7 +203,7 @@ public function export_file_for_engine($file) {
$data['solr_fileindexstatus'] = self::INDEXED_FILE_TRUE;
$data['title'] = $file->get_filename();
$data['modified'] = self::format_time_for_engine($file->get_timemodified());

// Placeholder value while prototyping the solr_vector field.
$data['solr_vector'] = 'Hello';
return $data;
}
}
51 changes: 50 additions & 1 deletion search/engine/solrrag/classes/ai/aiprovider.php
@@ -2,19 +2,67 @@
// We're mocking a core Moodle "AI" subsystem, à la OAuth 2.

namespace core\ai;

use \core\persistent;

class AIProvider extends persistent {
// Mock implementation; ultimately this would be backed by a real database table.
public function __construct($id = 0, \stdClass $record = null) {
// We deliberately skip the parent constructor: there is no real table behind
// this mock yet, so we avoid the database read and just fake the values.
if ($id > 0) {
$this->raw_set('id', $id);
$this->raw_set('name', "Fake AI Provider");
$this->raw_set('allowembeddings', true);
$this->raw_set('allowquery', true);
} else if ($record !== null) {
// Allow construction from a fake record as well (see get_records() below).
foreach ((array) $record as $property => $value) {
$this->raw_set($property, $value);
}
}
}

protected static function define_properties() {
return [
'name' => [
'type' => PARAM_TEXT
],
'allowembeddings' => [
'type' => PARAM_BOOL
],
'allowquery' => [
'type' => PARAM_BOOL
]
];
}

public function use_for_embeddings(): bool {
return $this->get('allowembeddings');
}

public function use_for_query(): bool {
return $this->get('allowquery');
}

/**
* Embeds each of the given documents.
*
* @param array $documents
* @return array An array of embedding vectors, one per document.
*/
public function embed_documents(array $documents) {
// Send the documents off to a back end, then return an array of each document's vectors.
print_r($documents);
return [
[0.0053587136790156364,
-0.0004999046213924885,
0.038883671164512634,
-0.003001077566295862,
-0.00900818221271038]
];
}

/**
* Embeds a single query/document.
*
* @param string $document The text to embed.
* @return array The embedding vector.
*/
public function embed_query($document): array {
print_r($document);
// Send the document to a back end and return the vector.
return [0.0053587136790156364,
-0.0004999046213924885,
0.038883671164512634,
-0.003001077566295862,
-0.00900818221271038
];
}
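
// A minimal sketch of what a non-mock embed_query() might do, using Moodle's
// curl wrapper against an OpenAI-style embeddings endpoint. The endpoint URL,
// model name, credential property and response shape are all assumptions here,
// not part of this commit:
//
// private function fetch_embedding(string $text): array {
//     $curl = new \curl();
//     $curl->setHeader([
//         'Content-Type: application/json',
//         'Authorization: Bearer ' . $this->get('apikey'), // Hypothetical credential property.
//     ]);
//     $response = $curl->post('https://api.openai.com/v1/embeddings', json_encode([
//         'model' => 'text-embedding-3-small',
//         'input' => $text,
//     ]));
//     $decoded = json_decode($response, true);
//     return $decoded['data'][0]['embedding'] ?? [];
// }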
/**
* We're overriding this whilst we don't have a real DB table.
* @param $filters
@@ -27,6 +75,7 @@ protected static function define_properties()
public static function get_records($filters = array(), $sort = '', $order = 'ASC', $skip = 0, $limit = 0) {
$records = [];
$fake = new static(0, (object) [
'id' => 1,
'name' => "Fake AI Provider",
// Flags required by use_for_embeddings() / use_for_query().
'allowembeddings' => true,
'allowquery' => true
]);
array_push($records, $fake);
3 changes: 3 additions & 0 deletions search/engine/solrrag/classes/ai/api.php
@@ -8,4 +8,7 @@ class api {
public static function get_all_providers() {
return array_values(AIProvider::get_records());
}

public static function get_provider(int $id): AIProvider {
return new AIProvider($id);
}
}
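
For context, the engine in this commit consumes this mock API like so (a condensed restatement of the constructor and add_stored_file() changes further below, not additional API surface):

    $provider = \core\ai\api::get_provider(1);
    if ($provider->use_for_embeddings()) {
        $embeddings = $provider->embed_documents([$filedoc]);
        $filedoc['solr_vector'] = "[" . implode(",", $embeddings[0]) . "]";
    }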
36 changes: 36 additions & 0 deletions search/engine/solrrag/classes/document.php
@@ -38,4 +38,40 @@ class document extends \search_solr\document {

]
);

/**
* Export the data for the given file in relation to this document.
*
* @param \stored_file $file The stored file we are talking about.
* @return array
*/
public function export_file_for_engine($file) {
$data = $this->export_for_engine();

// Content is indexed in the main document.
unset($data['content']);
unset($data['description1']);
unset($data['description2']);

// Going to append the fileid to give it a unique id.
$data['id'] = $data['id'].'-solrfile'.$file->get_id();
$data['type'] = \core_search\manager::TYPE_FILE;
$data['solr_fileid'] = $file->get_id();
$data['solr_filecontenthash'] = $file->get_contenthash();
$data['solr_fileindexstatus'] = self::INDEXED_FILE_TRUE;
$data['solr_vector'] = null;
$data['title'] = $file->get_filename();
$data['modified'] = self::format_time_for_engine($file->get_timemodified());

return $data;
}

/**
* Returns the "content" of the document for embedding.
* This may use some sort of external system. Not yet implemented.
* @return void
*/
public function fetch_document_contents() {

}
}
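
One plausible shape for the fetch_document_contents() stub above, assuming the embedding input is simply the exported text fields (hypothetical; the commit leaves the body empty and the return type undecided):

    public function fetch_document_contents() {
        $data = $this->export_for_engine();
        // Concatenate the text-bearing fields as the embedding input.
        return implode("\n", array_filter([
            $data['title'] ?? '',
            $data['content'] ?? '',
            $data['description1'] ?? '',
            $data['description2'] ?? '',
        ]));
    }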
218 changes: 146 additions & 72 deletions search/engine/solrrag/classes/engine.php
@@ -4,16 +4,34 @@

use search_solrrag\document;
use search_solrrag\schema;

// Fudge autoloading! $CFG must be pulled into scope because this file is loaded by the autoloader.
global $CFG;
require_once($CFG->dirroot . "/search/engine/solrrag/classes/ai/api.php");
require_once($CFG->dirroot . "/search/engine/solrrag/classes/ai/aiprovider.php");
use \core\ai\AIProvider;
class engine extends \search_solr\engine {

/**
* @var AIProvider|null AI provider object used to generate embeddings.
*/
protected ?AIProvider $embeddingprovider = null;

public function __construct(bool $alternateconfiguration = false) {
parent::__construct($alternateconfiguration);

// AI retrieval support: set up an AI provider if one is available.
// Ideally we'd be using a Moodle AI provider to tell us which LLM to use for generating embeddings, and
// then simply calling the API and getting some results back... but we don't have that yet.
// So we'll fudge this for the moment and leverage an OpenAI Web Service API via a simple HTTP request.
$aiproviderid = 1;
$aiprovider = \core\ai\api::get_provider($aiproviderid);
if ($aiprovider->use_for_embeddings()) {
$this->embeddingprovider = $aiprovider;
}
}

public function is_server_ready() {
$configured = $this->is_server_configured();
if ($configured !== true) {
@@ -44,84 +62,140 @@ public function is_server_ready() {
}

/**
* Adds a file to the search engine.
*
* Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
* Tika has much better content type detection than Moodle, and we will have many more doc failures
* if we try to send mime types.
*
* @param \search_solr\document $document
* @param \stored_file $storedfile
* @return void
* @throws \core_search\engine_exception
*/
protected function add_stored_file($document, $storedfile) {
$filedoc = $document->export_file_for_engine($storedfile);

// Should we even attempt to get vectors?
if (!is_null($this->embeddingprovider)) {
// Garnish $filedoc with the embedding vector. It would be nice if this could be done
// via the export_file_for_engine() call above, but that has no awareness of the engine.
$embeddings = $this->embeddingprovider->embed_documents([$filedoc]);
$filedoc['solr_vector'] = "[" . implode(",", $embeddings[0]) . "]";
print_r($filedoc);
} else {
// Potentially warn that the selected provider can't be used for
// generating embeddings for RAG.
}


// The remainder follows the underlying \search_solr\engine add_stored_file() implementation.

if (!$this->file_is_indexable($storedfile)) {
// For files that we don't consider indexable, we will still place a reference in the search engine.
$filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE;
$this->add_solr_document($filedoc);
return;
}
$curl = $this->get_curl_object();

$url = $this->get_connection_url('/update/extract');

// Return results as XML.
$url->param('wt', 'xml');

// This will prevent solr from automatically making fields for every tika output.
$url->param('uprefix', 'ignored_');

// Control how content is captured. This will keep our file content clean of non-important metadata.
$url->param('captureAttr', 'true');
// Move the content to a field for indexing.
$url->param('fmap.content', 'solr_filecontent');

// These are common fields that match the standard *_point dynamic field and cause an error.
$url->param('fmap.media_white_point', 'ignored_mwp');
$url->param('fmap.media_black_point', 'ignored_mbp');

// Copy each key to the url with literal.
// We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names.
foreach ($filedoc as $key => $value) {
// This will take any fields from tika that match our schema and discard them, so they don't overwrite ours.
$url->param('fmap.' . $key, 'ignored_' . $key);
// Place data in a tmp field.
$url->param('literal.mdltmp_' . $key, $value);
// Then move to the final field.
$url->param('fmap.mdltmp_' . $key, $key);
}

// This sets the true filename for Tika.
$url->param('resource.name', $storedfile->get_filename());

// A giant block of code that is really just error checking around the curl request.
try {
$requesturl = $url->out(false);
// We have to post the file directly in binary data (not using multipart) to avoid
// Solr bug SOLR-15039 which can cause incorrect data when you use multipart upload.
// Note this loads the whole file into memory; see limit in file_is_indexable().
$result = $curl->post($url->out(false), $storedfile->get_content());

$code = $curl->get_errno();
$info = $curl->get_info();

// Now error handling. It is just informational, since we aren't tracking per file/doc results.
if ($code != 0) {
// This means an internal cURL error occurred error is in result.
$message = 'Curl error ' . $code . ' while indexing file with document id ' . $filedoc['id'] . ': ' . $result . '.';
debugging($message, DEBUG_DEVELOPER);
} else if (isset($info['http_code']) && ($info['http_code'] !== 200)) {
// Unexpected HTTP response code.
$message = 'Error while indexing file with document id ' . $filedoc['id'];
// Try to get error message out of msg or title if it exists.
if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) {
$message .= ': ' . $matches[1];
} else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) {
$message .= ': ' . $matches[1];
}
// This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter.
if (CLI_SCRIPT && !PHPUNIT_TEST) {
mtrace($message);
if (debugging()) {
mtrace($requesturl);
}
// Suspicion that this fails due to the file contents being PDFs.
}
} else {
// Check for the expected status field.
if (preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $result, $matches)) {
// Now check for the expected status of 0, if not, error.
if ((int)$matches[1] !== 0) {
$message = 'Unexpected Solr status code ' . (int)$matches[1];
$message .= ' while indexing file with document id ' . $filedoc['id'] . '.';
debugging($message, DEBUG_DEVELOPER);
} else {
// The document was successfully indexed.
return;
}
} else {
// We received an unprocessable response.
$message = 'Unexpected Solr response while indexing file with document id ' . $filedoc['id'] . ': ';
$message .= strtok($result, "\n");
debugging($message, DEBUG_DEVELOPER);
}
}
} catch (\Exception $e) {
// There was an error, but we are not tracking per-file success, so we just continue on.
debugging('Unknown exception while indexing file "' . $storedfile->get_filename() . '".', DEBUG_DEVELOPER);
}

// If we get here, the document was not indexed due to an error, so we will index just the base info without the file.
$filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR;
$this->add_solr_document($filedoc);

// It would have been nice to use the underlying solr code, but it's too tightly integrated
// with talking to solr.
// return parent::add_stored_file($document, $storedfile);
}
}
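
On the retrieval side (not part of this commit), the stored solr_vector field would presumably be searched with Solr 9's dense-vector KNN query parser. A hedged sketch of what that might look like from the engine, assuming solr_vector is declared as a DenseVectorField in the Solr schema:

    // Hypothetical: find the top 10 documents nearest to an embedded query.
    $queryvector = $this->embeddingprovider->embed_query($querytext);
    $url = $this->get_connection_url('/select');
    $url->param('q', '{!knn f=solr_vector topK=10}[' . implode(',', $queryvector) . ']');
    $url->param('fl', 'id,title,score');
    $result = $this->get_curl_object()->get($url->out(false));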
