
Commit 32100c5
embedding logic efficiency and clean-up
brianpetro committed Feb 11, 2023
1 parent 7acd7cc commit 32100c5
Showing 2 changed files with 87 additions and 121 deletions.
main.js: 206 changes (86 additions, 120 deletions)
@@ -572,15 +572,13 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
/**
* BEGIN Block "section" embedding
*/
let has_blocks = false;
// get file contents
const note_contents = await this.app.vault.cachedRead(curr_file);
let processed_since_last_save = 0;
const note_sections = this.block_parser(note_contents, curr_file.path);
// console.log(note_sections);
// if note has more than one section (if only one then it's the same as full-content)
if(note_sections.length > 1) {
has_blocks = true;
// for each section in file
//console.log("Sections: " + note_sections.length);
for (let j = 0; j < note_sections.length; j++) {
@@ -592,6 +590,12 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
blocks.push(block_key);
let block_hash; // set hash of block_embed_input in correct scope
if (this.embeddings[block_key] && this.embeddings[block_key].meta) {
// skip if length of block_embed_input same as length of embeddings[block_key].meta.len
if (block_embed_input.length === this.embeddings[block_key].meta.len) {
// log skipping file
// console.log("skipping block (len)");
continue;
}
// add hash to blocks to prevent empty blocks triggering full-file embedding
// skip if embeddings key already exists and block mtime is greater than or equal to file mtime
if (this.embeddings[block_key].meta.mtime >= curr_file.stat.mtime) {
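
Annotation (not part of the diff): the lines added in this hunk put a cheap length comparison ahead of the existing mtime check, so a block whose embed input is exactly as long as the cached one is skipped without another API call. A minimal sketch of that check, assuming cache entries shaped like this.embeddings[block_key].meta = { len, mtime, hash, ... } as elsewhere in the plugin:

```js
// Illustrative sketch only; not code from the commit.
// Assumes a cache entry shaped like { meta: { len: number } }.
function block_len_unchanged(cached, block_embed_input) {
  return Boolean(
    cached &&
    cached.meta &&
    block_embed_input.length === cached.meta.len
  );
}

// Hypothetical usage inside the section loop:
//   if (block_len_unchanged(this.embeddings[block_key], block_embed_input)) continue;
```
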
@@ -607,36 +611,16 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
continue;
}
}
// get embeddings for block
// add block_embeddings to embeddings
// batch_promises.push(this.get_embeddings(block_key, block_embed_input, {
// mtime: curr_file.stat.mtime,
// hash: block_hash,
// file: curr_file_key,
// path: note_sections[j].path,
// }));
// if(batch_promises.length > 4) {
// await Promise.all(batch_promises);
// processed_since_last_save += batch_promises.length;
// // log embedding
// // console.log("embedding: " + curr_file.path);
// if (processed_since_last_save >= 30) {
// // write embeddings JSON to file
// await this.save_embeddings_to_file();
// // reset processed_since_last_save
// processed_since_last_save = 0;
// }
// // reset batch_promises
// batch_promises = [];
// }

// create req_batch for batching requests
req_batch.push([block_key, block_embed_input, {
mtime: curr_file.stat.mtime,
// oldmtime: curr_file.stat.mtime,
// get current datetime as unix timestamp
mtime: Date.now(),
hash: block_hash,
file: curr_file_key,
path: note_sections[j].path,
len: note_sections[j].length,
len: block_embed_input.length,
}]);
if(req_batch.length > 9) {
// add batch to batch_promises
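
Annotation (not part of the diff): rather than firing one request per block, the loop now accumulates [key, input, meta] tuples in req_batch and flushes once ten are queued; block meta also records Date.now() (the moment of embedding) instead of the file mtime, plus the real input length. A rough sketch of that batching pattern, assuming a get_embeddings_batch(batch) method that accepts such an array (as the diff above suggests) and simplified to await the flush directly:

```js
// Illustrative sketch only; not code from the commit.
// Assumes `get_embeddings_batch` accepts an array of [key, input, meta] tuples.
const BATCH_SIZE = 10;
let req_batch = [];

async function queue_embedding(plugin, key, input, meta) {
  req_batch.push([key, input, meta]);
  if (req_batch.length >= BATCH_SIZE) {
    // one API round trip covers up to BATCH_SIZE inputs
    await plugin.get_embeddings_batch(req_batch);
    req_batch = [];
  }
}
```
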
@@ -666,122 +650,94 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
/**
* BEGIN File "full note" embedding
*/
let skip = false;
// get current note file size
const curr_file_size = curr_file.stat.size;
// get file size from this.embeddings
let prev_file_size = 0;
if (this.embeddings[curr_file_key] && this.embeddings[curr_file_key].meta && this.embeddings[curr_file_key].meta.size) {
prev_file_size = this.embeddings[curr_file_key].meta.size;
// if curr file size is less than 10% different from prev file size
const file_delta_pct = Math.round((Math.abs(curr_file_size - prev_file_size) / curr_file_size) * 100);
if(file_delta_pct < 10) {
// skip embedding
// console.log("skipping file (size) " + curr_file.path);
this.render_log.skipped_low_delta[curr_file.name] = file_delta_pct + "%";
skip = true;
}
}

// if file length is less than ~8000 tokens use full file contents
// else if file length is greater than 8000 tokens build file_embed_input from file headings
file_embed_input += `:\n`;
/**
* TODO: improve/refactor the following "large file reduce to headings" logic
*/
if(!skip){
if(note_contents.length < MAX_EMBED_STRING_LENGTH) {
file_embed_input += note_contents
}else{
const note_meta_cache = this.app.metadataCache.getFileCache(curr_file);
// for each heading in file
if(typeof note_meta_cache.headings === "undefined") {
// console.log("no headings found, using first chunk of file instead");
file_embed_input += note_contents.substring(0, MAX_EMBED_STRING_LENGTH);
// console.log("chuck len: " + file_embed_input.length);
}else{
let note_headings = "";
for (let j = 0; j < note_meta_cache.headings.length; j++) {
// get heading level
const heading_level = note_meta_cache.headings[j].level;
// get heading text
const heading_text = note_meta_cache.headings[j].heading;
// build markdown heading
let md_heading = "";
for (let k = 0; k < heading_level; k++) {
md_heading += "#";
}
// add heading to note_headings
note_headings += `${md_heading} ${heading_text}\n`;
}
//console.log(note_headings);
file_embed_input += note_headings
if(file_embed_input.length > MAX_EMBED_STRING_LENGTH) {
file_embed_input = file_embed_input.substring(0, MAX_EMBED_STRING_LENGTH);
if(note_contents.length < MAX_EMBED_STRING_LENGTH) {
file_embed_input += note_contents
}else{
const note_meta_cache = this.app.metadataCache.getFileCache(curr_file);
// for each heading in file
if(typeof note_meta_cache.headings === "undefined") {
// console.log("no headings found, using first chunk of file instead");
file_embed_input += note_contents.substring(0, MAX_EMBED_STRING_LENGTH);
// console.log("chuck len: " + file_embed_input.length);
}else{
let note_headings = "";
for (let j = 0; j < note_meta_cache.headings.length; j++) {
// get heading level
const heading_level = note_meta_cache.headings[j].level;
// get heading text
const heading_text = note_meta_cache.headings[j].heading;
// build markdown heading
let md_heading = "";
for (let k = 0; k < heading_level; k++) {
md_heading += "#";
}
// add heading to note_headings
note_headings += `${md_heading} ${heading_text}\n`;
}
//console.log(note_headings);
file_embed_input += note_headings
if(file_embed_input.length > MAX_EMBED_STRING_LENGTH) {
file_embed_input = file_embed_input.substring(0, MAX_EMBED_STRING_LENGTH);
}
}
}

// skip embedding full file if blocks is not empty and all hashes are present in this.embeddings
// better than hashing file_embed_input because more resilient to inconsequential changes (whitespace between headings)
let file_hash = this.get_embed_hash(file_embed_input);
const file_hash = this.get_embed_hash(file_embed_input);
const existing_hash = (this.embeddings[curr_file_key] && this.embeddings[curr_file_key].meta) ? this.embeddings[curr_file_key].meta.hash : null;
if(!skip && existing_hash) {
if(file_hash === existing_hash) {
skip = true;
}
if(existing_hash && (file_hash === existing_hash)) {
// console.log("skipping file (hash): " + curr_file.path);
this.update_render_log(blocks, file_embed_input);
return;
};

// if not already skipping and blocks are present
const existing_blocks = (this.embeddings[curr_file_key] && this.embeddings[curr_file_key].meta) ? this.embeddings[curr_file_key].meta.blocks : null;
if(!skip && existing_blocks && has_blocks && Array.isArray(existing_blocks) && (blocks.length > 0)) {
// if blocks is equal to existing_blocks
if(blocks.length === existing_blocks.length) {
skip = true;
for(let j = 0; j < blocks.length; j++) {
// triggers re-embedding if blocks were re-ordered
if(blocks[j] !== existing_blocks[j]) {
skip = false;
break;
}
let existing_has_all_blocks = true;
if(existing_blocks && Array.isArray(existing_blocks) && (blocks.length > 0)) {
// if all blocks are in existing_blocks then skip (allows deletion of small blocks without triggering full file embedding)
for (let j = 0; j < blocks.length; j++) {
if(existing_blocks.indexOf(blocks[j]) === -1) {
existing_has_all_blocks = false;
break;
}
}
}
// skip if skip is true
if(!skip) {
let meta = {
mtime: curr_file.stat.mtime,
hash: file_hash,
path: curr_file.path,
size: curr_file.stat.size,
};
if(has_blocks && (blocks.length > 0)) {
meta.blocks = blocks;
}
// batch_promises.push(this.get_embeddings(curr_file_key, file_embed_input, meta));
req_batch.push([curr_file_key, file_embed_input, meta]);
}else{
if(has_blocks && (blocks.length > 0)) {
// multiply by 2 because implies we saved token spending on blocks(sections), too
this.render_log.tokens_saved_by_cache += file_embed_input.length/2;
}else{
// calc tokens saved by cache: divide by 4 for token estimate
this.render_log.tokens_saved_by_cache += file_embed_input.length/4;
// if existing has all blocks then check file size for delta
if(existing_has_all_blocks){
// get current note file size
const curr_file_size = curr_file.stat.size;
// get file size from this.embeddings
let prev_file_size = 0;
if (this.embeddings[curr_file_key] && this.embeddings[curr_file_key].meta && this.embeddings[curr_file_key].meta.size) {
prev_file_size = this.embeddings[curr_file_key].meta.size;
// if curr file size is less than 10% different from prev file size
const file_delta_pct = Math.round((Math.abs(curr_file_size - prev_file_size) / curr_file_size) * 100);
if(file_delta_pct < 10) {
// skip embedding
// console.log("skipping file (size) " + curr_file.path);
this.render_log.skipped_low_delta[curr_file.name] = file_delta_pct + "%";
this.update_render_log(blocks, file_embed_input);
return;
}
}
// log skipping file
// console.log("skipping cached file");
}
// if batch_promises is empty then return
// if(batch_promises.length === 0) {
// return;
// }

// wait for all promises to resolve
// await Promise.all(batch_promises);

if(req_batch.length === 0) {
return;
}
let meta = {
mtime: curr_file.stat.mtime,
hash: file_hash,
path: curr_file.path,
size: curr_file.stat.size,
blocks: blocks,
};
// batch_promises.push(this.get_embeddings(curr_file_key, file_embed_input, meta));
req_batch.push([curr_file_key, file_embed_input, meta]);
// send batch request
await this.get_embeddings_batch(req_batch);
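
Annotation (not part of the diff): the reworked full-note pass now exits early in two cases: when the hash of the (possibly heading-reduced) embed input matches the cached hash, or when every current block key is already cached and the file size has drifted less than 10% since the last embedding; only otherwise is the full-note request queued. A condensed sketch of those two early exits, with names mirroring the plugin's but simplified:

```js
// Illustrative sketch only; not code from the commit.
// `cached` is assumed to be this.embeddings[curr_file_key], with meta = { hash, blocks, size }.
function should_skip_full_note(cached, file_hash, blocks, curr_file_size) {
  const meta = cached && cached.meta;
  if (!meta) return false;

  // 1) identical embed-input hash means nothing changed worth re-embedding
  if (meta.hash && meta.hash === file_hash) return true;

  // 2) every current block already embedded AND file size changed by less than 10%
  const existing_blocks = Array.isArray(meta.blocks) ? meta.blocks : null;
  const has_all_blocks = existing_blocks && blocks.length > 0 &&
    blocks.every((b) => existing_blocks.indexOf(b) !== -1);
  if (has_all_blocks && meta.size) {
    const delta_pct = Math.round((Math.abs(curr_file_size - meta.size) / curr_file_size) * 100);
    if (delta_pct < 10) return true;
  }
  return false;
}
```

Checking membership of every current block rather than exact ordering is what the comment in the diff calls out: small blocks can be deleted without forcing a full-file re-embed.
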

@@ -791,6 +747,16 @@ class SmartConnectionsPlugin extends Obsidian.Plugin {
// write embeddings JSON to file
await this.save_embeddings_to_file();
}

}
update_render_log(blocks, file_embed_input) {
if (blocks.length > 0) {
// multiply by 2 because implies we saved token spending on blocks(sections), too
this.render_log.tokens_saved_by_cache += file_embed_input.length / 2;
} else {
// calc tokens saved by cache: divide by 4 for token estimate
this.render_log.tokens_saved_by_cache += file_embed_input.length / 4;
}
}

async get_embeddings(key, embed_input, meta={}) {
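
Annotation (not part of the diff): the new update_render_log helper estimates tokens saved by the cache from the character count of the skipped embed input, using roughly 4 characters per token; when block embeddings were skipped as well, the divisor drops from 4 to 2 so the logged savings double, crediting the avoided section requests too. A quick check of that arithmetic under the assumed 4-characters-per-token heuristic:

```js
// Illustrative sketch only; not code from the commit.
// Assumed heuristic: roughly 4 characters per token.
const input_len = 8000;                  // an 8,000-character embed input
const saved_note_only   = input_len / 4; // 2000 tokens: only the full-note embedding was skipped
const saved_with_blocks = input_len / 2; // 4000: doubled because its section embeddings were skipped too
console.log(saved_note_only, saved_with_blocks);
```
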
manifest.json: 2 changes (1 addition, 1 deletion)
@@ -3,7 +3,7 @@
"name": "Smart Connections",
"author": "Brian Petro",
"description": "Find links to similar notes using artificial intelligence from OpenAI.",
"version": "1.1.15",
"version": "1.1.16",
"minAppVersion": "1.1.0",
"authorUrl": "https://wfhbrian.com",
"isDesktopOnly": true
