
Commit 32100c5
embedding logic efficiency and clean-up
brianpetro committed Feb 11, 2023
1 parent 7acd7cc commit 32100c5
Showing 2 changed files with 87 additions and 121 deletions.
main.js: 206 changes (86 additions, 120 deletions)
@@ -572,15 +572,13 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
/**
* BEGIN Block "section" embedding
*/
let has_blocks = false;
// get file contents
const note_contents = await this.app.vault.cachedRead(curr_file);
let processed_since_last_save = 0;
const note_sections = this.block_parser(note_contents, curr_file.path);
// console.log(note_sections);
// if note has more than one section (if only one then it's the same as full-content)
if(note_sections.length > 1) {
has_blocks = true;
// for each section in file
//console.log("Sections: " + note_sections.length);
for (let j = 0; j < note_sections.length; j++) {
@@ -592,6 +590,12 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
blocks.push(block_key);
let block_hash; // set hash of block_embed_input in correct scope
if (this.embeddings[block_key] && this.embeddings[block_key].meta) {
// skip if length of block_embed_input same as length of embeddings[block_key].meta.len
if (block_embed_input.length === this.embeddings[block_key].meta.len) {
// log skipping file
// console.log("skipping block (len)");
continue;
}
// add hash to blocks to prevent empty blocks triggering full-file embedding
// skip if embeddings key already exists and block mtime is greater than or equal to file mtime
if (this.embeddings[block_key].meta.mtime >= curr_file.stat.mtime) {
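
Annotation (not part of the diff): the lines added in this hunk put a cheap length comparison ahead of the existing mtime check, so a block whose embed input is exactly as long as the cached one is skipped without another API call. A minimal sketch of that check, assuming cache entries shaped like this.embeddings[block_key].meta = { len, mtime, hash, ... } as elsewhere in the plugin:

```js
// Illustrative sketch only; not code from the commit.
// Assumes a cache entry shaped like { meta: { len: number } }.
function block_len_unchanged(cached, block_embed_input) {
  return Boolean(
    cached &&
    cached.meta &&
    block_embed_input.length === cached.meta.len
  );
}

// Hypothetical usage inside the section loop:
//   if (block_len_unchanged(this.embeddings[block_key], block_embed_input)) continue;
```
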
@@ -607,36 +611,16 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
continue;
}
}
// get embeddings for block
// add block_embeddings to embeddings
// batch_promises.push(this.get_embeddings(block_key, block_embed_input, {
// mtime: curr_file.stat.mtime,
// hash: block_hash,
// file: curr_file_key,
// path: note_sections[j].path,
// }));
// if(batch_promises.length > 4) {
// await Promise.all(batch_promises);
// processed_since_last_save += batch_promises.length;
// // log embedding
// // console.log("embedding: " + curr_file.path);
// if (processed_since_last_save >= 30) {
// // write embeddings JSON to file
// await this.save_embeddings_to_file();
// // reset processed_since_last_save
// processed_since_last_save = 0;
// }
// // reset batch_promises
// batch_promises = [];
// }

// create req_batch for batching requests
req_batch.push([block_key, block_embed_input, {
mtime: curr_file.stat.mtime,
// oldmtime: curr_file.stat.mtime,
// get current datetime as unix timestamp
mtime: Date.now(),
hash: block_hash,
file: curr_file_key,
path: note_sections[j].path,
len: note_sections[j].length,
len: block_embed_input.length,
}]);
if(req_batch.length > 9) {
// add batch to batch_promises
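
Annotation (not part of the diff): rather than firing one request per block, the loop now accumulates [key, input, meta] tuples in req_batch and flushes once ten are queued; block meta also records Date.now() (the moment of embedding) instead of the file mtime, plus the real input length. A rough sketch of that batching pattern, assuming a get_embeddings_batch(batch) method that accepts such an array (as the diff above suggests) and simplified to await the flush directly:

```js
// Illustrative sketch only; not code from the commit.
// Assumes `get_embeddings_batch` accepts an array of [key, input, meta] tuples.
const BATCH_SIZE = 10;
let req_batch = [];

async function queue_embedding(plugin, key, input, meta) {
  req_batch.push([key, input, meta]);
  if (req_batch.length >= BATCH_SIZE) {
    // one API round trip covers up to BATCH_SIZE inputs
    await plugin.get_embeddings_batch(req_batch);
    req_batch = [];
  }
}
```
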
@@ -666,122 +650,94 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
/**
* BEGIN File "full note" embedding
*/
let skip = false;
// get current note file size
const curr_file_size = curr_file.stat.size;
// get file size from this.embeddings
let prev_file_size = 0;
if (this.embeddings[curr_file_key] && this.embeddings[curr_file_key].meta && this.embeddings[curr_file_key].meta.size) {
prev_file_size = this.embeddings[curr_file_key].meta.size;
// if curr file size is less than 10% different from prev file size
const file_delta_pct = Math.round((Math.abs(curr_file_size - prev_file_size) / curr_file_size) * 100);
if(file_delta_pct < 10) {
// skip embedding
// console.log("skipping file (size) " + curr_file.path);
this.render_log.skipped_low_delta[curr_file.name] = file_delta_pct + "%";
skip = true;
}
}

// if file length is less than ~8000 tokens use full file contents
// else if file length is greater than 8000 tokens build file_embed_input from file headings
file_embed_input += `:\n`;
/**
* TODO: improve/refactor the following "large file reduce to headings" logic
*/
if(!skip){
if(note_contents.length < MAX_EMBED_STRING_LENGTH) {
file_embed_input += note_contents
}else{
const note_meta_cache = this.app.metadataCache.getFileCache(curr_file);
// for each heading in file
if(typeof note_meta_cache.headings === "undefined") {
// console.log("no headings found, using first chunk of file instead");
file_embed_input += note_contents.substring(0, MAX_EMBED_STRING_LENGTH);
// console.log("chuck len: " + file_embed_input.length);
}else{
let note_headings = "";
for (let j = 0; j < note_meta_cache.headings.length; j++) {
// get heading level
const heading_level = note_meta_cache.headings[j].level;
// get heading text
const heading_text = note_meta_cache.headings[j].heading;
// build markdown heading
let md_heading = "";
for (let k = 0; k < heading_level; k++) {
md_heading += "#";
}
// add heading to note_headings
note_headings += `${md_heading} ${heading_text}\n`;
}
//console.log(note_headings);
file_embed_input += note_headings
if(file_embed_input.length > MAX_EMBED_STRING_LENGTH) {
file_embed_input = file_embed_input.substring(0, MAX_EMBED_STRING_LENGTH);
if(note_contents.length < MAX_EMBED_STRING_LENGTH) {
file_embed_input += note_contents
}else{
const note_meta_cache = this.app.metadataCache.getFileCache(curr_file);
// for each heading in file
if(typeof note_meta_cache.headings === "undefined") {
// console.log("no headings found, using first chunk of file instead");
file_embed_input += note_contents.substring(0, MAX_EMBED_STRING_LENGTH);
// console.log("chuck len: " + file_embed_input.length);
}else{
let note_headings = "";
for (let j = 0; j < note_meta_cache.headings.length; j++) {
// get heading level
const heading_level = note_meta_cache.headings[j].level;
// get heading text
const heading_text = note_meta_cache.headings[j].heading;
// build markdown heading
let md_heading = "";
for (let k = 0; k < heading_level; k++) {
md_heading += "#";
}
// add heading to note_headings
note_headings += `${md_heading} ${heading_text}\n`;
}
//console.log(note_headings);
file_embed_input += note_headings
if(file_embed_input.length > MAX_EMBED_STRING_LENGTH) {
file_embed_input = file_embed_input.substring(0, MAX_EMBED_STRING_LENGTH);
}
}
}

// skip embedding full file if blocks is not empty and all hashes are present in this.embeddings
// better than hashing file_embed_input because more resilient to inconsequential changes (whitespace between headings)
let file_hash = this.get_embed_hash(file_embed_input);
const file_hash = this.get_embed_hash(file_embed_input);
const existing_hash = (this.embeddings[curr_file_key] && this.embeddings[curr_file_key].meta) ? this.embeddings[curr_file_key].meta.hash : null;
if(!skip && existing_hash) {
if(file_hash === existing_hash) {
skip = true;
}
if(existing_hash && (file_hash === existing_hash)) {
// console.log("skipping file (hash): " + curr_file.path);
this.update_render_log(blocks, file_embed_input);
return;
};

// if not already skipping and blocks are present
const existing_blocks = (this.embeddings[curr_file_key] && this.embeddings[curr_file_key].meta) ? this.embeddings[curr_file_key].meta.blocks : null;
if(!skip && existing_blocks && has_blocks && Array.isArray(existing_blocks) && (blocks.length > 0)) {
// if blocks is equal to existing_blocks
if(blocks.length === existing_blocks.length) {
skip = true;
for(let j = 0; j < blocks.length; j++) {
// triggers re-embedding if blocks were re-ordered
if(blocks[j] !== existing_blocks[j]) {
skip = false;
break;
}
let existing_has_all_blocks = true;
if(existing_blocks && Array.isArray(existing_blocks) && (blocks.length > 0)) {
// if all blocks are in existing_blocks then skip (allows deletion of small blocks without triggering full file embedding)
for (let j = 0; j < blocks.length; j++) {
if(existing_blocks.indexOf(blocks[j]) === -1) {
existing_has_all_blocks = false;
break;
}
}
}
// skip if skip is true
if(!skip) {
let meta = {
mtime: curr_file.stat.mtime,
hash: file_hash,
path: curr_file.path,
size: curr_file.stat.size,
};
if(has_blocks && (blocks.length > 0)) {
meta.blocks = blocks;
}
// batch_promises.push(this.get_embeddings(curr_file_key, file_embed_input, meta));
req_batch.push([curr_file_key, file_embed_input, meta]);
}else{
if(has_blocks && (blocks.length > 0)) {
// multiply by 2 because implies we saved token spending on blocks(sections), too
this.render_log.tokens_saved_by_cache += file_embed_input.length/2;
}else{
// calc tokens saved by cache: divide by 4 for token estimate
this.render_log.tokens_saved_by_cache += file_embed_input.length/4;
// if existing has all blocks then check file size for delta
if(existing_has_all_blocks){
// get current note file size
const curr_file_size = curr_file.stat.size;
// get file size from this.embeddings
let prev_file_size = 0;
if (this.embeddings[curr_file_key] && this.embeddings[curr_file_key].meta && this.embeddings[curr_file_key].meta.size) {
prev_file_size = this.embeddings[curr_file_key].meta.size;
// if curr file size is less than 10% different from prev file size
const file_delta_pct = Math.round((Math.abs(curr_file_size - prev_file_size) / curr_file_size) * 100);
if(file_delta_pct < 10) {
// skip embedding
// console.log("skipping file (size) " + curr_file.path);
this.render_log.skipped_low_delta[curr_file.name] = file_delta_pct + "%";
this.update_render_log(blocks, file_embed_input);
return;
}
}
// log skipping file
// console.log("skipping cached file");
}
// if batch_promises is empty then return
// if(batch_promises.length === 0) {
// return;
// }

// wait for all promises to resolve
// await Promise.all(batch_promises);

if(req_batch.length === 0) {
return;
}
let meta = {
mtime: curr_file.stat.mtime,
hash: file_hash,
path: curr_file.path,
size: curr_file.stat.size,
blocks: blocks,
};
// batch_promises.push(this.get_embeddings(curr_file_key, file_embed_input, meta));
req_batch.push([curr_file_key, file_embed_input, meta]);
// send batch request
await this.get_embeddings_batch(req_batch);
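
Annotation (not part of the diff): the reworked full-note pass now exits early in two cases: when the hash of the (possibly heading-reduced) embed input matches the cached hash, or when every current block key is already cached and the file size has drifted less than 10% since the last embedding; only otherwise is the full-note request queued. A condensed sketch of those two early exits, with names mirroring the plugin's but simplified:

```js
// Illustrative sketch only; not code from the commit.
// `cached` is assumed to be this.embeddings[curr_file_key], with meta = { hash, blocks, size }.
function should_skip_full_note(cached, file_hash, blocks, curr_file_size) {
  const meta = cached && cached.meta;
  if (!meta) return false;

  // 1) identical embed-input hash means nothing changed worth re-embedding
  if (meta.hash && meta.hash === file_hash) return true;

  // 2) every current block already embedded AND file size changed by less than 10%
  const existing_blocks = Array.isArray(meta.blocks) ? meta.blocks : null;
  const has_all_blocks = existing_blocks && blocks.length > 0 &&
    blocks.every((b) => existing_blocks.indexOf(b) !== -1);
  if (has_all_blocks && meta.size) {
    const delta_pct = Math.round((Math.abs(curr_file_size - meta.size) / curr_file_size) * 100);
    if (delta_pct < 10) return true;
  }
  return false;
}
```

Checking membership of every current block rather than exact ordering is what the comment in the diff calls out: small blocks can be deleted without forcing a full-file re-embed.
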

@@ -791,6 +747,16 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
// write embeddings JSON to file
await this.save_embeddings_to_file();
}

}
update_render_log(blocks, file_embed_input) {
if (blocks.length > 0) {
// multiply by 2 because implies we saved token spending on blocks(sections), too
this.render_log.tokens_saved_by_cache += file_embed_input.length / 2;
} else {
// calc tokens saved by cache: divide by 4 for token estimate
this.render_log.tokens_saved_by_cache += file_embed_input.length / 4;
}
}

async get_embeddings(key, embed_input, meta={}) {
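
Annotation (not part of the diff): the new update_render_log helper estimates tokens saved by the cache from the character count of the skipped embed input, using roughly 4 characters per token; when block embeddings were skipped as well, the divisor drops from 4 to 2 so the logged savings double, crediting the avoided section requests too. A quick check of that arithmetic under the assumed 4-characters-per-token heuristic:

```js
// Illustrative sketch only; not code from the commit.
// Assumed heuristic: roughly 4 characters per token.
const input_len = 8000;                  // an 8,000-character embed input
const saved_note_only   = input_len / 4; // 2000 tokens: only the full-note embedding was skipped
const saved_with_blocks = input_len / 2; // 4000: doubled because its section embeddings were skipped too
console.log(saved_note_only, saved_with_blocks);
```
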
manifest.json: 2 changes (1 addition, 1 deletion)
@@ -3,7 +3,7 @@
"name": "Smart Connections",
"author": "Brian Petro",
"description": "Find links to similar notes using artificial intelligence from OpenAI.",
"version": "1.1.15",
"version": "1.1.16",
"minAppVersion": "1.1.0",
"authorUrl": "https://wfhbrian.com",
"isDesktopOnly": true
