Skip to content

Commit

Permalink
- Added new dev dependencies for local smart modules in package.json …
Browse files Browse the repository at this point in the history
…to facilitate integration testing.

- Introduced a test_content.js script to generate dummy markdown files for testing SmartEntities with embeddings.
- Created transformers.test.js to validate embedding functionality using the generated test content, ensuring proper integration with SmartSources and SmartBlocks.

These changes improve the testing framework for SmartEntities, enabling better validation of embedding processes and integration with other smart modules.
  • Loading branch information
Brian Joseph Petro committed Dec 22, 2024
1 parent f242a68 commit ce3a388
Show file tree
Hide file tree
Showing 3 changed files with 213 additions and 1 deletion.
7 changes: 6 additions & 1 deletion smart-entities/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@
},
"homepage": "https://jsbrains.org",
"devDependencies": {
"ava": "^6.0.1"
"ava": "^6.0.1",
"smart-blocks": "file:../smart-blocks",
"smart-environment": "file:../smart-environment",
"smart-fs": "file:../smart-fs",
"smart-settings": "file:../smart-settings",
"smart-sources": "file:../smart-sources"
},
"dependencies": {
"smart-collections": "file:../smart-collections",
Expand Down
81 changes: 81 additions & 0 deletions smart-entities/test/test_content.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env node

/**
* @file test_content.js
* @description
* Creates some dummy text/markdown files in `smart-entities/test/test-content/`
* for integration tests of SmartEntities with embeddings.
*
* Usage:
* 1) cd into `smart-entities/` (the package root; the output path is resolved from the current working directory)
* 2) run: `node test/test_content.js`
* 3) The script will create the `test/test-content/` subfolder with a few .md files.
*
* Then, in your test code (e.g., transformers.test.js), you can point the environment
* to `smart-entities/test/test-content/` to run embedding or entity operations.
*/

import fs from 'fs';
import path from 'path';

// Output folder for the generated fixtures, resolved against the current working directory.
const baseDir = path.join(process.cwd(), 'test/test-content');

// Create the folder (and any missing parents) when it is not there yet.
const baseDirExists = fs.existsSync(baseDir);
if (!baseDirExists) {
  fs.mkdirSync(baseDir, { recursive: true });
}

// Fixture files, declared as a file-name -> markdown-body map and then
// expanded into `{ name, content }` records in declaration order.
const filesData = Object.entries({
  'entity_example_1.md': `# Entity Example 1
This is a short example file used to test SmartEntities with an embedding model.
It doesn't have too much content, but enough to produce a vector.
- Key topics: embedding, short text.
`,
  'entity_example_2.md': `# Entity Example 2
Another sample file with somewhat different text.
We'll see if the embeddings place it near or far from Example 1.
- Key topics: similarity, embedding, semantic distance.
`,
  'entity_irrelevant.md': `# Entity Irrelevant
Completely unrelated content focusing on quantum entanglement and local hidden variables.
We expect this to embed quite differently from the 'embedding' or 'similarity' topics in the other examples.
- Key topics: quantum, entanglement, hidden variables.
`,
  'entity_random.md': `# Entity Random
Jibberish lines:
Rondolp hifer qwt opsidu. Alkpfe yoyz klmb?
This should test random text handling in the embedding space.
- Key topics: random, nonsense text.
`,
}).map(([name, content]) => ({ name, content }));

// Write every fixture to disk, reporting each file as it is created.
for (const { name, content } of filesData) {
  const target = path.join(baseDir, name);
  fs.writeFileSync(target, content, 'utf8');
  console.log(`Created: ${name}`);
}

console.log(`\nTest content has been created in: ${baseDir}\n`);
126 changes: 126 additions & 0 deletions smart-entities/test/transformers.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import test from 'ava';
import path from 'path';
import fs from 'fs';
import { execFileSync, execSync } from 'child_process';

// Import or reference your environment, sources, blocks, etc.:
import { SmartEnv } from 'smart-environment';
import { SmartFs } from 'smart-fs';
import { NodeFsSmartFsAdapter } from 'smart-fs/adapters/node_fs.js';
import { SmartSettings } from 'smart-settings';

// We'll use SmartSources + SmartEntities to show an embedding test
import { SmartSources, SmartSource } from 'smart-sources';
import source_data_adapter from 'smart-sources/adapters/data/ajson_multi_file.js';
import { MarkdownSourceContentAdapter } from 'smart-sources/adapters/markdown_source.js';
import { SmartBlocks, SmartBlock } from 'smart-blocks';
import block_data_adapter from 'smart-blocks/adapters/data/ajson_multi_file.js';
import { MarkdownBlockContentAdapter } from 'smart-blocks/adapters/markdown_block.js';

// For embedding we use a Transformers adapter as an example:
import { SmartEmbedModel } from 'smart-embed-model';
import { SmartEmbedTransformersAdapter } from 'smart-embed-model/adapters/transformers.js';

/**
 * One-time setup shared by every test in this file.
 *
 * 1) Makes sure `test/test-content/` (relative to the current working
 *    directory) exists, generating it with `test/test_content.js` if needed.
 * 2) Builds a SmartEnv rooted at that folder with SmartSources/SmartBlocks
 *    collections and a Transformers-backed embed model, and stores it on
 *    `t.context.env`.
 * 3) Runs the load, import and save queues on the sources collection.
 *
 * @throws {Error} when the content folder is missing and the generator
 *   script cannot be found either.
 */
test.before(async (t) => {
  // 1) Ensure our test content has been created
  const contentDir = path.join(process.cwd(), 'test/test-content');
  if (!fs.existsSync(contentDir)) {
    const scriptPath = path.join(process.cwd(), 'test/test_content.js');
    if (!fs.existsSync(scriptPath)) {
      throw new Error(`Missing test_content.js script at ${scriptPath}`);
    }
    // Launch the generator with the Node binary that is running the tests and
    // pass the script path as an argv entry instead of interpolating it into a
    // shell command, so paths containing spaces or shell metacharacters work.
    execFileSync(process.execPath, [scriptPath]);
  }

  // 2) Create an environment
  t.context.env = await SmartEnv.create(
    {
      // Minimal stand-in for the host object: settings are always empty and never persisted.
      load_settings: () => ({}),
      save_settings: () => {},
      get settings() { return {}; },
    },
    {
      env_path: contentDir,
      modules: {
        smart_fs: { class: SmartFs, adapter: NodeFsSmartFsAdapter },
        smart_settings: { class: SmartSettings },
        smart_embed_model: {
          class: SmartEmbedModel,
          adapters: {
            transformers: SmartEmbedTransformersAdapter,
          },
        },
      },
      collections: {
        // We'll attach a SmartSources collection
        smart_sources: {
          class: SmartSources,
          data_adapter: source_data_adapter,
          source_adapters: {
            md: MarkdownSourceContentAdapter,
          },
        },
        smart_blocks: {
          class: SmartBlocks,
          data_adapter: block_data_adapter,
          block_adapters: {
            md: MarkdownBlockContentAdapter,
          },
        },
      },
      item_types: {
        SmartSource,
        SmartBlock,
      },
      default_settings: {
        smart_sources: {
          data_dir: 'multi',
          embed_model: {
            adapter: 'transformers',
            transformers: {
              model_key: 'TaylorAI/bge-micro-v2', // or any local/huggingface model
              legacy_transformers: false,
              gpu_batch_size: 2,
            },
          },
          // For demonstration, embed anything with >=10 chars
          min_chars: 10,
        },
      },
    }
  );

  // 3) Initialize the sources
  await t.context.env.smart_sources.init_items();
  await t.context.env.smart_sources.process_load_queue();

  // 4) Import from actual markdown => parse => queue embed
  await t.context.env.smart_sources.process_source_import_queue();
  // 5) Save any newly created items
  await t.context.env.smart_sources.process_save_queue();
});

/**
 * Removes the generated `test/test-content/` folder once the file's tests
 * are done.
 *
 * Registered with `.always` so the cleanup also runs when a test or an
 * earlier hook fails; a plain `test.after` hook is skipped in that case and
 * would leave the generated fixtures on disk for the next run.
 */
test.after.always(() => {
  // `force: true` makes this a no-op when the folder was never created.
  fs.rmSync(path.join(process.cwd(), 'test/test-content'), { recursive: true, force: true });
});

/**
 * Runs the embed queue on the sources collection and verifies that at least
 * one source ends up with a non-empty embedding vector.
 */
test.serial("Check that sources have embeddings via Transformers", async (t) => {
  const { env } = t.context;
  const sources = env.smart_sources;

  // Process the embed queue for sources
  await sources.process_embed_queue();

  // Sources that meet the configured min_chars (>= 10) are expected to get a .vec;
  // collect every source that actually has one.
  const embedded = Object.values(sources.items).filter((src) => src.vec);
  t.true(embedded.length > 0, 'At least one source has a vector');

  // Every collected vector must actually contain values. Re-checking
  // truthiness here would always pass because of the filter above, and an
  // empty array would slip through.
  for (const src of embedded) {
    t.true(src.vec.length > 0, `Source ${src.key} has a non-empty vector`);
    console.log(`Source ${src.key} vector length: ${src.vec.length}`);
  }
});

0 comments on commit ce3a388

Please sign in to comment.