- Added new dev dependencies for local smart modules in package.json …

…to facilitate integration testing. - Introduced a test_content.js script to generate dummy markdown files for testing SmartEntities with embeddings. - Created transformers.test.js to validate embedding functionality using the generated test content, ensuring proper integration with SmartSources and SmartBlocks. These changes improve the testing framework for SmartEntities, enabling better validation of embedding processes and integration with other smart modules.
brianpetro · Dec 22, 2024 · ce3a388 · ce3a388
1 parent f242a68
commit ce3a388
Show file tree

Hide file tree

Showing 3 changed files with 213 additions and 1 deletion.
diff --git a/smart-entities/package.json b/smart-entities/package.json
@@ -21,7 +21,12 @@
   },
   "homepage": "https://jsbrains.org",
   "devDependencies": {
-    "ava": "^6.0.1"
+    "ava": "^6.0.1",
+    "smart-blocks": "file:../smart-blocks",
+    "smart-environment": "file:../smart-environment",
+    "smart-fs": "file:../smart-fs",
+    "smart-settings": "file:../smart-settings",
+    "smart-sources": "file:../smart-sources"
   },
   "dependencies": {
     "smart-collections": "file:../smart-collections",

diff --git a/smart-entities/test/test_content.js b/smart-entities/test/test_content.js
@@ -0,0 +1,81 @@
+#!/usr/bin/env node
+
+/**
+ * @file test_content.js
+ * @description
+ * Creates some dummy text/markdown files in `smart-entities/test/test-content/`
+ * for integration tests of SmartEntities with embeddings.  
+ *
+ * Usage:
+ *   1) cd into `smart-entities/` (output paths are resolved from the working directory)
+ *   2) run: `node test/test_content.js`
+ *   3) The script will create the `test/test-content/` subfolder with a few .md files.
+ *
+ * Then, in your test code (e.g., transformers.test.js), you can point the environment
+ * to `smart-entities/test/test-content/` to run embedding or entity operations.
+ */
+
+import fs from 'fs';
+import path from 'path';
+
+// The directory where we'll create test files. It is resolved against the current
+// working directory, so launch the script from the package root (`smart-entities/`).
+const baseDir = path.join(process.cwd(), 'test/test-content');
+
+// `recursive: true` makes mkdirSync succeed when the directory already exists, so a
+// separate existsSync() pre-check (and its check-then-create race) is unnecessary.
+fs.mkdirSync(baseDir, { recursive: true });
+
+const filesData = [ // fixtures to write: `name` is the file name inside baseDir, `content` is the markdown body
+  { // short note about embeddings
+    name: 'entity_example_1.md',
+    content: `# Entity Example 1
+
+This is a short example file used to test SmartEntities with an embedding model.
+It doesn't have too much content, but enough to produce a vector.
+
+- Key topics: embedding, short text.
+`
+  },
+  { // second note that shares the "embedding" topic with example 1
+    name: 'entity_example_2.md',
+    content: `# Entity Example 2
+
+Another sample file with somewhat different text.
+We'll see if the embeddings place it near or far from Example 1.
+
+- Key topics: similarity, embedding, semantic distance.
+`
+  },
+  { // deliberately off-topic note (quantum physics)
+    name: 'entity_irrelevant.md',
+    content: `# Entity Irrelevant
+
+Completely unrelated content focusing on quantum entanglement and local hidden variables.
+We expect this to embed quite differently from the 'embedding' or 'similarity' topics in the other examples.
+
+- Key topics: quantum, entanglement, hidden variables.
+`
+  },
+  { // note that contains nonsense words
+    name: 'entity_random.md',
+    content: `# Entity Random
+
+Jibberish lines:
+Rondolp hifer qwt opsidu. Alkpfe yoyz klmb?
+
+This should test random text handling in the embedding space.
+
+- Key topics: random, nonsense text.
+`
+  }
+];
+
+// Emit every fixture into baseDir, reporting each file as it is written
+for (const { name, content } of filesData) {
+  const target = path.join(baseDir, name);
+  fs.writeFileSync(target, content, 'utf8');
+  console.log(`Created: ${name}`);
+}
+
+console.log(`\nTest content has been created in: ${baseDir}\n`);
diff --git a/smart-entities/test/transformers.test.js b/smart-entities/test/transformers.test.js
@@ -0,0 +1,126 @@
+import test from 'ava';
+import path from 'path';
+import fs from 'fs';
+import { execFileSync, execSync } from 'child_process';
+
+// Import or reference your environment, sources, blocks, etc.:
+import { SmartEnv } from 'smart-environment';
+import { SmartFs } from 'smart-fs';
+import { NodeFsSmartFsAdapter } from 'smart-fs/adapters/node_fs.js';
+import { SmartSettings } from 'smart-settings';
+
+// We'll use SmartSources + SmartEntities to show an embedding test
+import { SmartSources, SmartSource } from 'smart-sources';
+import source_data_adapter from 'smart-sources/adapters/data/ajson_multi_file.js';
+import { MarkdownSourceContentAdapter } from 'smart-sources/adapters/markdown_source.js';
+import { SmartBlocks, SmartBlock } from 'smart-blocks';
+import block_data_adapter from 'smart-blocks/adapters/data/ajson_multi_file.js';
+import { MarkdownBlockContentAdapter } from 'smart-blocks/adapters/markdown_block.js';
+
+// For embedding we use a Transformers adapter as an example:
+import { SmartEmbedModel } from 'smart-embed-model';
+import { SmartEmbedTransformersAdapter } from 'smart-embed-model/adapters/transformers.js';
+
+test.before(async (t) => {
+  // 1) Ensure the markdown fixtures exist; generate them with test_content.js if they do not.
+  const contentDir = path.join(process.cwd(), 'test/test-content');
+  if (!fs.existsSync(contentDir)) {
+    const scriptPath = path.join(process.cwd(), 'test/test_content.js');
+    if (!fs.existsSync(scriptPath)) {
+      throw new Error(`Missing test_content.js script at ${scriptPath}`);
+    }
+    // Pass the path as an argv entry (no shell) so checkouts under directories with
+    // spaces work, and reuse the Node binary that is already running the tests.
+    execFileSync(process.execPath, [scriptPath]);
+  }
+
+  // 2) Create an environment rooted at the fixtures dir; the first argument always reports empty settings
+  t.context.env = await SmartEnv.create(
+    {
+      load_settings: () => ({}),
+      save_settings: () => {},
+      get settings() { return {}; },
+    },
+    {
+      env_path: contentDir,
+      modules: {
+        smart_fs: { class: SmartFs, adapter: NodeFsSmartFsAdapter },
+        smart_settings: { class: SmartSettings },
+        smart_embed_model: {
+          class: SmartEmbedModel,
+          adapters: {
+            transformers: SmartEmbedTransformersAdapter,
+          },
+        },
+      },
+      collections: {
+        // SmartSources collection, with the markdown adapter registered for `md`
+        smart_sources: {
+          class: SmartSources,
+          data_adapter: source_data_adapter,
+          source_adapters: {
+            md: MarkdownSourceContentAdapter,
+          }
+        },
+        smart_blocks: {
+          class: SmartBlocks,
+          data_adapter: block_data_adapter,
+          block_adapters: {
+            md: MarkdownBlockContentAdapter,
+          }
+        },
+      },
+      item_types: {
+        SmartSource,
+        SmartBlock
+      },
+      default_settings: {
+        smart_sources: {
+          data_dir: 'multi',
+          embed_model: {
+            adapter: 'transformers',
+            transformers: {
+              model_key: 'TaylorAI/bge-micro-v2',  // or any local/huggingface model
+              legacy_transformers: false,
+              gpu_batch_size: 2
+            },
+          },
+          // For demonstration, embed anything with >=10 chars
+          min_chars: 10,
+        }
+      }
+    }
+  );
+
+  // 3) Initialize the sources, then drain the load queue
+  await t.context.env.smart_sources.init_items();
+  await t.context.env.smart_sources.process_load_queue();
+
+  // 4) Import from actual markdown => parse => queue embed
+  await t.context.env.smart_sources.process_source_import_queue();
+  // 5) Save any newly created items
+  await t.context.env.smart_sources.process_save_queue();
+});
+
+// `.always` makes AVA run this cleanup even after a failed hook or test, so stale fixtures never leak into the next run.
+test.after.always(() => {
+  fs.rmSync(path.join(process.cwd(), 'test/test-content'), { recursive: true, force: true });
+});
+
+test.serial("Check that sources have embeddings via Transformers", async (t) => {
+  const { smart_sources: sources } = t.context.env;
+
+  // Drain the embed queue before inspecting the source items.
+  await sources.process_embed_queue();
+
+  // Keep only the sources that ended up with a truthy `.vec`;
+  // the test requires at least one of them.
+  const withVectors = Object.values(sources.items).filter((item) => item.vec);
+  t.true(withVectors.length > 0, 'At least one source has a vector');
+
+  // Re-assert per source and log each vector's length for manual inspection.
+  for (const item of withVectors) {
+    t.truthy(item.vec, `Source ${item.key} has a vector`);
+    console.log(`Source ${item.key} vector length: ${item.vec?.length}`);
+  }
+});