-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Added new dev dependencies for local smart modules in package.json …
…to facilitate integration testing. - Introduced a test_content.js script to generate dummy markdown files for testing SmartEntities with embeddings. - Created transformers.test.js to validate embedding functionality using the generated test content, ensuring proper integration with SmartSources and SmartBlocks. These changes improve the testing framework for SmartEntities, enabling better validation of embedding processes and integration with other smart modules.
- Loading branch information
Brian Joseph Petro
committed
Dec 22, 2024
1 parent
f242a68
commit ce3a388
Showing
3 changed files
with
213 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
#!/usr/bin/env node | ||
|
||
/** | ||
* @file test_content.js | ||
* @description | ||
* Creates some dummy text/markdown files in `smart-entities/test/test-content/` | ||
* for integration tests of SmartEntities with embeddings. | ||
* | ||
* Usage: | ||
* 1) cd into `smart-entities/test/` | ||
* 2) run: `node test_content.js` | ||
* 3) The script will create `test-content/` subfolder with a few .md files. | ||
* | ||
* Then, in your test code (e.g., transformers.test.js), you can point the environment | ||
* to `smart-entities/test/test-content/` to run embedding or entity operations. | ||
*/ | ||
|
||
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
|
||
// Resolve the output directory relative to this script instead of the current
// working directory. The script lives in `test/`, so the fixtures always land
// in `test/test-content/` whether it is launched from the package root
// (`node test/test_content.js`) or from inside `test/` (`node test_content.js`).
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const baseDir = path.join(scriptDir, 'test-content');

// `recursive: true` makes this a no-op when the directory already exists,
// so no separate existence check is needed.
fs.mkdirSync(baseDir, { recursive: true });

/**
 * Fixture files to generate. Each entry has:
 *  - `name`: file name created inside `baseDir`
 *  - `content`: markdown text written verbatim to that file
 */
const filesData = [
  {
    name: 'entity_example_1.md',
    content: `# Entity Example 1
This is a short example file used to test SmartEntities with an embedding model.
It doesn't have too much content, but enough to produce a vector.
- Key topics: embedding, short text.
`
  },
  {
    name: 'entity_example_2.md',
    content: `# Entity Example 2
Another sample file with somewhat different text.
We'll see if the embeddings place it near or far from Example 1.
- Key topics: similarity, embedding, semantic distance.
`
  },
  {
    name: 'entity_irrelevant.md',
    content: `# Entity Irrelevant
Completely unrelated content focusing on quantum entanglement and local hidden variables.
We expect this to embed quite differently from the 'embedding' or 'similarity' topics in the other examples.
- Key topics: quantum, entanglement, hidden variables.
`
  },
  {
    name: 'entity_random.md',
    content: `# Entity Random
Jibberish lines:
Rondolp hifer qwt opsidu. Alkpfe yoyz klmb?
This should test random text handling in the embedding space.
- Key topics: random, nonsense text.
`
  }
];

// Write each fixture (overwriting any previous copy) and report progress.
for (const { name, content } of filesData) {
  fs.writeFileSync(path.join(baseDir, name), content, 'utf8');
  console.log(`Created: ${name}`);
}

console.log(`\nTest content has been created in: ${baseDir}\n`);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
import test from 'ava'; | ||
import path from 'path'; | ||
import fs from 'fs'; | ||
import { execSync } from 'child_process'; | ||
|
||
// Import or reference your environment, sources, blocks, etc.: | ||
import { SmartEnv } from 'smart-environment'; | ||
import { SmartFs } from 'smart-fs'; | ||
import { NodeFsSmartFsAdapter } from 'smart-fs/adapters/node_fs.js'; | ||
import { SmartSettings } from 'smart-settings'; | ||
|
||
// We'll use SmartSources + SmartEntities to show an embedding test | ||
import { SmartSources, SmartSource } from 'smart-sources'; | ||
import source_data_adapter from 'smart-sources/adapters/data/ajson_multi_file.js'; | ||
import { MarkdownSourceContentAdapter } from 'smart-sources/adapters/markdown_source.js'; | ||
import { SmartBlocks, SmartBlock } from 'smart-blocks'; | ||
import block_data_adapter from 'smart-blocks/adapters/data/ajson_multi_file.js'; | ||
import { MarkdownBlockContentAdapter } from 'smart-blocks/adapters/markdown_block.js'; | ||
|
||
// For embedding we use a Transformers adapter as an example: | ||
import { SmartEmbedModel } from 'smart-embed-model'; | ||
import { SmartEmbedTransformersAdapter } from 'smart-embed-model/adapters/transformers.js'; | ||
|
||
/**
 * Suite setup: makes sure the markdown fixtures exist, builds a SmartEnv rooted
 * at the fixture directory, then loads, imports and saves the sources so the
 * tests can work with them via `t.context.env`.
 *
 * Paths are resolved against `process.cwd()`, so the suite has to be started
 * from the directory that contains `test/`.
 */
test.before(async (t) => {
  // 1) Ensure our test content has been created, generating it on demand.
  const contentDir = path.join(process.cwd(), 'test/test-content');
  if (!fs.existsSync(contentDir)) {
    const scriptPath = path.join(process.cwd(), 'test/test_content.js');
    if (!fs.existsSync(scriptPath)) {
      throw new Error(`Missing test_content.js script at ${scriptPath}`);
    }
    // Quote both paths so a checkout location containing spaces does not split
    // the command, and reuse the Node binary that is running this test.
    execSync(`"${process.execPath}" "${scriptPath}"`);
  }

  // 2) Create an environment
  t.context.env = await SmartEnv.create(
    {
      // Minimal stand-in for the host object: settings are always empty and
      // saving is a no-op.
      load_settings: () => ({}),
      save_settings: () => {},
      get settings() { return {}; },
    },
    {
      env_path: contentDir,
      modules: {
        smart_fs: { class: SmartFs, adapter: NodeFsSmartFsAdapter },
        smart_settings: { class: SmartSettings },
        smart_embed_model: {
          class: SmartEmbedModel,
          adapters: {
            transformers: SmartEmbedTransformersAdapter,
          },
        },
      },
      collections: {
        // We'll attach a SmartSources collection
        smart_sources: {
          class: SmartSources,
          data_adapter: source_data_adapter,
          source_adapters: {
            md: MarkdownSourceContentAdapter,
          }
        },
        smart_blocks: {
          class: SmartBlocks,
          data_adapter: block_data_adapter,
          block_adapters: {
            md: MarkdownBlockContentAdapter,
          }
        },
      },
      item_types: {
        SmartSource,
        SmartBlock
      },
      default_settings: {
        smart_sources: {
          data_dir: 'multi',
          embed_model: {
            adapter: 'transformers',
            transformers: {
              model_key: 'TaylorAI/bge-micro-v2', // or any local/huggingface model
              legacy_transformers: false,
              gpu_batch_size: 2
            },
          },
          // For demonstration, embed anything with >=10 chars
          min_chars: 10,
        }
      }
    }
  );

  // 3) Initialize the sources
  await t.context.env.smart_sources.init_items();
  await t.context.env.smart_sources.process_load_queue();

  // 4) Import from actual markdown => parse => queue embed
  await t.context.env.smart_sources.process_source_import_queue();
  // 5) Save any newly created items
  await t.context.env.smart_sources.process_save_queue();
});
|
||
// Clean up the generated test-content folder. `.always` makes the hook run
// even when a test or an earlier hook failed, so fixtures are not left behind
// after a red run. `force: true` keeps this quiet if the folder is missing.
test.after.always(() => {
  fs.rmSync(path.join(process.cwd(), 'test/test-content'), { recursive: true, force: true });
});
|
||
// Verifies that processing the embed queue leaves at least one source with a
// usable (non-empty) vector.
test.serial("Check that sources have embeddings via Transformers", async (t) => {
  const { env } = t.context;
  const sources = env.smart_sources;

  // Process the embed queue for sources
  await sources.process_embed_queue();

  // Only sources that actually received a vector are inspected below; the
  // test requires that there is at least one of them.
  const embedded = Object.values(sources.items).filter((src) => src.vec);
  t.true(embedded.length > 0, 'At least one source has a vector');

  for (const src of embedded) {
    // `embedded` is already filtered on a truthy `vec`, so re-checking
    // truthiness could never fail; check that the vector has entries instead.
    t.true(src.vec.length > 0, `Source ${src.key} has a non-empty vector`);
    // Print out the embedding size
    console.log(`Source ${src.key} vector length: ${src.vec.length}`);
  }
});