Commit

Merge pull request #12 from aurelio-labs/simonas/async
feat: support async
jamescalam authored Jul 3, 2024
2 parents c67b860 + d3d4b16 commit 67b3524
Showing 10 changed files with 1,316 additions and 448 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
__pycache__
*.pyc
.venv
+.venv*
.DS_Store
venv/
/.vscode
1,322 changes: 903 additions & 419 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "semantic-chunkers"
version = "0.0.6"
version = "0.0.7"
description = "Super advanced chunking methods for AI"
authors = ["Aurelio AI <[email protected]>"]
readme = "README.md"
@@ -16,7 +16,7 @@ regex = "^2023.12.25"
tiktoken = ">=0.7.0,<1.0.0"
matplotlib = { version = "^3.8.3", optional = true}
requests-mock = "^1.12.1"
semantic-router = ">=0.0.20,<0.1.0"
semantic-router = ">=0.0.48,<0.1.0"

[tool.poetry.extras]
stats = ["matplotlib"]
@@ -28,6 +28,7 @@ pytest = "^7.4.3"
pytest-mock = "^3.12.0"
pytest-cov = "^4.1.0"
pytest-xdist = "^3.5.0"
+pytest-asyncio = "^0.23.7"
mypy = "^1.7.1"
types-pyyaml = "^6.0.12.12"
types-requests = "^2.31.0"
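The new pytest-asyncio dev dependency is what lets the test suite define awaitable test functions for the async chunker paths. A minimal sketch of such a test (the test body, the ConsecutiveChunker import path, and the encoder fixture are illustrative assumptions, not contents of this diff):

```python
import pytest

from semantic_chunkers import ConsecutiveChunker


@pytest.mark.asyncio
async def test_acall_returns_chunks(encoder):  # `encoder` is an assumed fixture
    chunker = ConsecutiveChunker(encoder=encoder, score_threshold=0.45)
    chunks = await chunker.acall(["First topic. Still first topic. New topic."])
    assert len(chunks) == 1  # one input document -> one list of chunks
    assert all(len(chunk.splits) >= 1 for chunk in chunks[0])
```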
2 changes: 1 addition & 1 deletion semantic_chunkers/chunkers/base.py
@@ -2,8 +2,8 @@

from colorama import Fore, Style
from pydantic.v1 import BaseModel, Extra

from semantic_router.encoders.base import BaseEncoder

from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter

52 changes: 49 additions & 3 deletions semantic_chunkers/chunkers/consecutive.py
@@ -1,11 +1,11 @@
from typing import Any, List
-from tqdm.auto import tqdm

import numpy as np

from semantic_router.encoders.base import BaseEncoder
-from semantic_chunkers.schema import Chunk
+from tqdm.auto import tqdm

from semantic_chunkers.chunkers.base import BaseChunker
+from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter

@@ -58,6 +58,40 @@ def _chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:
        self.chunks = chunks
        return chunks

    async def _async_chunk(
        self, splits: List[Any], batch_size: int = 64
    ) -> List[Chunk]:
        """Merge splits into chunks using semantic similarity.
        :param splits: splits to be merged into chunks.
        :return: list of chunks.
        """
        split_embeds = []
        num_splits = len(splits)
        for i in tqdm(range(0, num_splits, batch_size)):
            split_embeds.extend(await self.encoder.acall(splits[i : i + batch_size]))
        norm_embeds = split_embeds / np.linalg.norm(split_embeds, axis=1, keepdims=True)
        sim_matrix = np.matmul(norm_embeds, norm_embeds.T)
        chunks = []
        curr_split_start_idx = 0

        for idx in tqdm(range(1, norm_embeds.shape[0])):
            curr_sim_score = sim_matrix[idx - 1][idx]
            if idx < len(sim_matrix) and curr_sim_score < self.score_threshold:
                chunks.append(
                    Chunk(
                        splits=splits[curr_split_start_idx:idx],
                        is_triggered=True,
                        triggered_score=curr_sim_score,
                    )
                )
                curr_split_start_idx = idx
        # append final chunk
        chunks.append(Chunk(splits=splits[curr_split_start_idx:]))
        self.chunks = chunks
        return chunks

    def __call__(self, docs: List[Any]) -> List[List[Chunk]]:
        """Split documents into smaller chunks based on semantic similarity.
@@ -76,3 +110,15 @@ def __call__(self, docs: List[Any]) -> List[List[Chunk]]:
            doc_chunks = self._chunk(splits)
            all_chunks.append(doc_chunks)
        return all_chunks

    async def acall(self, docs: List[Any]) -> List[List[Chunk]]:
        """Asynchronously split documents into smaller chunks based on
        semantic similarity, awaiting the encoder's async embedding calls.
        :param docs: list of text documents to be chunked; to chunk a
            single document, pass it as a list with a single element.
        :return: list of list objects containing the chunks.
        """
        all_chunks = []
        for doc in docs:
            # split the document into sentences (if needed)
            if isinstance(doc, str):
                splits = self._split(doc)
            else:
                splits = doc
            doc_chunks = await self._async_chunk(splits)
            all_chunks.append(doc_chunks)
        return all_chunks
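Taken together, acall mirrors the synchronous __call__ but awaits the encoder: splits are embedded in batches (64 by default) via encoder.acall, and a new chunk starts wherever the cosine similarity between adjacent splits drops below score_threshold. A usage sketch under assumptions (OpenAIEncoder is one semantic-router encoder that provides an async acall; the document text and threshold here are made up):

```python
import asyncio

from semantic_router.encoders import OpenAIEncoder

from semantic_chunkers import ConsecutiveChunker


async def main() -> None:
    encoder = OpenAIEncoder()  # assumes OPENAI_API_KEY is set in the environment
    chunker = ConsecutiveChunker(encoder=encoder, score_threshold=0.45)
    docs = ["Cats purr when content. Cats also knead. GDP rose 2% last quarter."]
    chunks = await chunker.acall(docs)
    for chunk in chunks[0]:
        # is_triggered/triggered_score are set when a similarity drop caused the split
        print(chunk.splits, chunk.is_triggered, chunk.triggered_score)


asyncio.run(main())
```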
81 changes: 78 additions & 3 deletions semantic_chunkers/chunkers/cumulative.py
@@ -1,11 +1,11 @@
from typing import Any, List
-from tqdm.auto import tqdm

import numpy as np

from semantic_router.encoders import BaseEncoder
-from semantic_chunkers.schema import Chunk
+from tqdm.auto import tqdm

from semantic_chunkers.chunkers.base import BaseChunker
+from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter

@@ -76,6 +76,62 @@ def _chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:

        return chunks

    async def _async_chunk(
        self, splits: List[Any], batch_size: int = 64
    ) -> List[Chunk]:
        """Merge splits into chunks using semantic similarity.
        :param splits: splits to be merged into chunks.
        :return: list of chunks.
        """
        chunks = []
        curr_chunk_start_idx = 0
        num_splits = len(splits)

        for idx in tqdm(range(num_splits)):
            if idx + 1 < num_splits:  # Ensure there is a next document to compare with.
                if idx == 0:
                    # On the first iteration, compare the
                    # first document directly to the second.
                    curr_chunk_docs = splits[idx]
                else:
                    # For subsequent iterations, compare cumulative
                    # documents up to the current one with the next.
                    curr_chunk_docs = "\n".join(splits[curr_chunk_start_idx : idx + 1])
                next_doc = splits[idx + 1]

                # Embedding and similarity calculation remains the same.
                curr_chunk_docs_embed_result = await self.encoder.acall(
                    [curr_chunk_docs]
                )
                next_doc_embed_result = await self.encoder.acall([next_doc])
                curr_chunk_docs_embed = curr_chunk_docs_embed_result[0]
                next_doc_embed = next_doc_embed_result[0]

                curr_sim_score = np.dot(curr_chunk_docs_embed, next_doc_embed) / (
                    np.linalg.norm(curr_chunk_docs_embed)
                    * np.linalg.norm(next_doc_embed)
                )
                # Decision to chunk based on similarity score.
                if curr_sim_score < self.score_threshold:
                    chunks.append(
                        Chunk(
                            splits=list(splits[curr_chunk_start_idx : idx + 1]),
                            is_triggered=True,
                            triggered_score=curr_sim_score,
                        )
                    )
                    curr_chunk_start_idx = (
                        idx + 1
                    )  # Update the start index for the next segment.

        # Add the last segment after the loop.
        if curr_chunk_start_idx < num_splits:
            chunks.append(Chunk(splits=list(splits[curr_chunk_start_idx:])))

        return chunks

    def __call__(self, docs: List[str]) -> List[List[Chunk]]:
        """Split documents into smaller chunks based on semantic similarity.
@@ -94,3 +150,22 @@ def __call__(self, docs: List[str]) -> List[List[Chunk]]:
            doc_chunks = self._chunk(splits)
            all_chunks.append(doc_chunks)
        return all_chunks

    async def acall(self, docs: List[str]) -> List[List[Chunk]]:
        """Split documents into smaller chunks based on semantic similarity.
        :param docs: list of text documents to be chunked; to chunk a
            single document, pass it as a list with a single element.
        :return: list of list objects containing the chunks.
        """
        all_chunks = []
        for doc in docs:
            # split the document into sentences (if needed)
            if isinstance(doc, str):
                splits = self._split(doc)
            else:
                splits = doc
            doc_chunks = await self._async_chunk(splits)
            all_chunks.append(doc_chunks)
        return all_chunks
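Unlike the consecutive chunker, which batch-embeds all splits once and builds a full similarity matrix, this cumulative variant re-embeds the accumulated chunk text at each step and compares it to the next split with plain cosine similarity. A self-contained illustration of that decision rule (the toy vectors and the 0.45 threshold are assumptions, not values from this diff):

```python
import numpy as np


def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    # same formula as in _async_chunk above: dot product divided by the norms
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


score_threshold = 0.45
curr_chunk_embed = np.array([0.9, 0.1, 0.0])  # embedding of accumulated chunk text
next_doc_embed = np.array([0.1, 0.9, 0.0])  # embedding of the next split

score = cosine_sim(curr_chunk_embed, next_doc_embed)
print(round(score, 2))  # 0.22 -> below threshold
if score < score_threshold:
    print("similarity dropped: close the current chunk and start a new one")
```

Note that each loop iteration of the cumulative path awaits two encoder calls rather than one batched call, so it will generally be slower than the consecutive chunker on long documents.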