From cfe4de13b4027dad60fb059c6e4d230241dbd65c Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sun, 27 Oct 2024 15:00:48 -0600 Subject: [PATCH 1/7] use numpy bitwise count, require numpy>=2 --- pyproject.toml | 8 ++++---- setup.py | 35 +++++------------------------------ 2 files changed, 9 insertions(+), 34 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 89b25b0..9517d77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,17 @@ [build-system] -requires = ["setuptools", "wheel", "setuptools_scm[toml]", "Cython", "numpy"] +requires = ["setuptools", "wheel", "setuptools_scm[toml]", "Cython", "numpy>=2"] build-backend = "setuptools.build_meta" [project] name = "wordllama" dynamic = ["version"] -description = "WordLlama Embedding Utility" +description = "WordLlama NLP Utility" readme = { file = "README.md", content-type = "text/markdown" } license = { file = "LICENSE" } -requires-python = ">=3.8" +requires-python = ">=3.9" authors = [{ name = "Lee Miller", email = "dleemiller@gmail.com" }] dependencies = [ - "numpy", + "numpy>=2", "safetensors", "tokenizers", "toml", diff --git a/setup.py b/setup.py index a5eeb36..cee6f44 100644 --- a/setup.py +++ b/setup.py @@ -5,32 +5,9 @@ numpy_include = np.get_include() -extra_compile_args = [] +extra_compile_args = ["-O3", "-ffast-math"] extra_link_args = [] -if platform.system() == "Darwin": - if platform.machine() == "arm64": - extra_compile_args.extend(["-arch", "arm64", "-O3", "-ffast-math"]) - extra_link_args.extend(["-arch", "arm64"]) - else: - extra_compile_args.extend(["-arch", "x86_64", "-O3", "-ffast-math"]) - extra_link_args.extend(["-arch", "x86_64"]) -elif platform.system() == "Windows": - extra_compile_args.extend(["/O2"]) -else: # Linux and others - if platform.machine().startswith("arm"): - if platform.architecture()[0] == "32bit": - extra_compile_args.extend(["-march=armv7-a", "-mfpu=neon"]) - extra_link_args.extend(["-march=armv7-a", "-mfpu=neon"]) - else: # 64-bit ARM - extra_compile_args.extend(["-march=armv8-a"]) - extra_link_args.extend(["-march=armv8-a"]) - elif platform.machine() in ["x86_64", "AMD64"]: - extra_compile_args.extend(["-march=native", "-mpopcnt"]) - extra_link_args.extend(["-march=native", "-mpopcnt"]) - -extra_compile_args.extend(["-O3", "-ffast-math"]) - extensions = [ Extension( "wordllama.algorithms.splitter", @@ -61,7 +38,7 @@ define_macros=[], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, - language="c++" + language="c++", ), Extension( "wordllama.algorithms.find_local_minima", @@ -70,7 +47,7 @@ define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, - language="c++" + language="c++", ), Extension( "wordllama.algorithms.vector_similarity", @@ -79,15 +56,13 @@ define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, - ) - - + ), ] setup( name="Embedding and lightweight NLP utility.", use_scm_version=True, - setup_requires=['setuptools_scm'], + setup_requires=["setuptools_scm"], ext_modules=cythonize( extensions, compiler_directives={ From 88d2d25663f82607a78f2bd515bc59ba682aca58 Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sun, 27 Oct 2024 15:02:16 -0600 Subject: [PATCH 2/7] removing hardware instructions in favor of numpy implementation --- wordllama/algorithms/vector_similarity.pyx | 68 +++++++--------------- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git 
a/wordllama/algorithms/vector_similarity.pyx b/wordllama/algorithms/vector_similarity.pyx index 234207c..2bc4e18 100644 --- a/wordllama/algorithms/vector_similarity.pyx +++ b/wordllama/algorithms/vector_similarity.pyx @@ -12,36 +12,8 @@ from numpy cimport ( np.import_array() -cdef extern from *: - """ - #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) - #include - static inline int popcount(uint64_t x) { - return __builtin_popcountll(x); - } - #elif defined(__GNUC__) && (defined(__ARM_NEON) || defined(__aarch64__)) - #include - static inline int popcount(uint64_t x) { - // No direct 64-bit popcount in NEON, need to split into two 32-bit parts - uint32_t lo = (uint32_t)x; - uint32_t hi = (uint32_t)(x >> 32); - return vaddv_u8(vcnt_u8(vcreate_u8(lo))) + vaddv_u8(vcnt_u8(vcreate_u8(hi))); - } - #else - static inline int popcount(uint64_t x) { - x = x - ((x >> 1) & 0x5555555555555555); - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0F; - x = x + (x >> 8); - x = x + (x >> 16); - x = x + (x >> 32); - return x & 0x0000007F; - } - #endif - """ - int popcount(uint64_t x) nogil - -cpdef object hamming_distance(object a, object b): +cpdef object hamming_distance(np.ndarray[np.uint64_t, ndim=2, mode='c'] a, + np.ndarray[np.uint64_t, ndim=2, mode='c'] b): """ Compute the Hamming distance between two arrays of binary vectors. @@ -52,32 +24,32 @@ cpdef object hamming_distance(object a, object b): Returns: np.ndarray: A 2D array containing the Hamming distances. """ - cdef Py_ssize_t i, j, k - cdef int dist + cdef Py_ssize_t i cdef Py_ssize_t n = a.shape[0] cdef Py_ssize_t m = b.shape[0] cdef Py_ssize_t width = a.shape[1] - - # Allocate distance array - distance = np.zeros((n, m), dtype=np.uint32) - - # Create a typed memoryview - cdef uint32_t[:, :] distance_view = distance - - # Ensure contiguous + if not a.flags.c_contiguous or not b.flags.c_contiguous: raise ValueError("Input arrays must be C-contiguous") - # Create typed memoryviews - cdef uint64_t[:, :] a_view = a - cdef uint64_t[:, :] b_view = b + cdef np.ndarray[np.uint32_t, ndim=2, mode='c'] distance = np.zeros((n, m), dtype=np.uint32) + cdef np.ndarray[np.uint64_t, ndim=1] a_row + cdef np.ndarray[np.uint64_t, ndim=2] xor_result + cdef np.ndarray[np.uint8_t, ndim=2] popcounts + cdef np.ndarray[np.uint32_t, ndim=1] distances_i for i in range(n): - for j in range(m): - dist = 0 - for k in range(width): - dist += popcount(a_view[i, k] ^ b_view[j, k]) - distance_view[i, j] = dist + a_row = a[i, :] + + # XOR 'a_row' and all rows in 'b' + xor_result = np.bitwise_xor(a_row[np.newaxis, :], b) + + # Compute popcounts + popcounts = np.bitwise_count(xor_result) + + # Sum to get Hamming distance + distances_i = np.sum(popcounts, axis=1, dtype=np.uint32) + distance[i, :] = distances_i return distance From eb62bb4025e1cb31360ccac55adf901bbbaf7d32 Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sun, 27 Oct 2024 15:16:35 -0600 Subject: [PATCH 3/7] fix pooling dim --- tests/test_functional.py | 10 +++++++++- wordllama/inference.py | 13 ++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 7e73df5..542d441 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -3,7 +3,15 @@ class TestFunctional(unittest.TestCase): - def test_function_clustering(self): wl = WordLlama.load() wl.cluster(["a", "b"], k=2) + + def test_function_similarity(self): + wl = WordLlama.load() + wl.similarity("a", "b") + + 
def test_function_similarity_binary(self): + wl = WordLlama.load() + wl.binary = True + wl.similarity("a", "b") diff --git a/wordllama/inference.py b/wordllama/inference.py index bf4ac3c..ff2f84f 100644 --- a/wordllama/inference.py +++ b/wordllama/inference.py @@ -94,7 +94,10 @@ def embed( num_texts = len(texts) embedding_dim = self.embedding.shape[1] np_type = np.float32 if not self.binary else np.uint64 - pooled_embeddings = np.empty((num_texts, embedding_dim), dtype=np_type) + pooled_embeddings = np.empty( + (num_texts, embedding_dim if not self.binary else embedding_dim // 64), + dtype=np_type, + ) for i in range(0, num_texts, batch_size): chunk = texts[i : i + batch_size] @@ -209,10 +212,10 @@ def rank( def deduplicate( self, - docs: List[str], - threshold: float = 0.9, - return_indices: bool = False, - batch_size: Optional[int] = None + docs: List[str], + threshold: float = 0.9, + return_indices: bool = False, + batch_size: Optional[int] = None, ) -> List[Union[str, int]]: """Deduplicate documents based on a similarity threshold. From ab1bc754b647381c388c676a35fbe573d78c2c47 Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sun, 27 Oct 2024 17:36:48 -0600 Subject: [PATCH 4/7] updating macros --- setup.py | 9 +- wordllama/algorithms/bm25_scorer.pyx | 44 +++++ wordllama/algorithms/deduplicate_helpers.pyx | 1 - wordllama/algorithms/kmeans.pyx | 1 - wordllama/algorithms/ragfile.pyx | 119 +++++++++++++ wordllama/algorithms/topic_model.pyx | 173 +++++++++++++++++++ wordllama/algorithms/vector_similarity.pyx | 1 - 7 files changed, 341 insertions(+), 7 deletions(-) create mode 100644 wordllama/algorithms/bm25_scorer.pyx create mode 100644 wordllama/algorithms/ragfile.pyx create mode 100644 wordllama/algorithms/topic_model.pyx diff --git a/setup.py b/setup.py index cee6f44..161beee 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ extra_compile_args = ["-O3", "-ffast-math"] extra_link_args = [] +define_macros = [("NPY_NO_DEPRECATED_API", "NPY_2_0_API_VERSION")] extensions = [ Extension( @@ -19,7 +20,7 @@ "wordllama.algorithms.deduplicate_helpers", ["wordllama/algorithms/deduplicate_helpers.pyx"], include_dirs=[numpy_include], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + define_macros=define_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ), @@ -27,7 +28,7 @@ "wordllama.algorithms.kmeans", ["wordllama/algorithms/kmeans.pyx"], include_dirs=[numpy_include], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + define_macros=define_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ), @@ -44,7 +45,7 @@ "wordllama.algorithms.find_local_minima", ["wordllama/algorithms/find_local_minima.pyx"], include_dirs=[numpy_include], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + define_macros=define_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, language="c++", @@ -53,7 +54,7 @@ "wordllama.algorithms.vector_similarity", ["wordllama/algorithms/vector_similarity.pyx"], include_dirs=[numpy_include], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + define_macros=define_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ), diff --git a/wordllama/algorithms/bm25_scorer.pyx b/wordllama/algorithms/bm25_scorer.pyx new file mode 100644 index 0000000..7f5c79e --- /dev/null +++ b/wordllama/algorithms/bm25_scorer.pyx @@ -0,0 +1,44 @@ +import numpy as np +cimport numpy as np +from cython cimport boundscheck, 
wraparound + +# Memoryviews allow fast access to arrays +@boundscheck(False) # Disable bounds checking for performance +@wraparound(False) # Disable negative index wraparound for performance +cpdef void optimized_search( + np.ndarray[np.int32_t, ndim=1] query_idx, + np.ndarray[np.float32_t, ndim=2] similarity_matrix, + np.ndarray[np.float32_t, ndim=1] idf_vector, + np.ndarray[np.float32_t, ndim=1] scores, + list tokenized_texts, + float k1, float b, float avg_doc_len, int top_k): + + cdef int i, j, doc_length + cdef float alpha, beta, fq_sum, score_sum + cdef np.ndarray[np.int32_t, ndim=1] doc_idx + cdef np.ndarray[np.float32_t, ndim=1] fq + cdef np.ndarray[np.int32_t, ndim=2] mesh1, mesh2 + cdef np.ndarray[np.float32_t, ndim=1] temp_score + + alpha = k1 + 1 + + # Loop through the documents (tokenized_texts) + for i in range(len(tokenized_texts)): + doc = tokenized_texts[i] + doc_length = np.sum(doc.attention_mask) # Fast sum via NumPy + beta = k1 * (1 - b + b * (doc_length / avg_doc_len)) + + doc_idx = np.array([x for x in doc.ids if x > 0], dtype=np.int32) + + # Meshgrid to compute query-document term interactions + mesh1, mesh2 = np.meshgrid(query_idx, doc_idx, indexing='ij') + + # Access similarity_matrix via memoryviews + fq = similarity_matrix[mesh1, mesh2].sum(axis=0) + + # Reshape idf_vector[query_idx] to enable broadcasting + temp_score = (idf_vector[query_idx].reshape((-1, 1)) * ((fq * alpha) / (fq + beta))).sum(axis=0) + + # Sum the score for the document + scores[i] = np.sum(temp_score) + diff --git a/wordllama/algorithms/deduplicate_helpers.pyx b/wordllama/algorithms/deduplicate_helpers.pyx index 52de559..6b9e805 100644 --- a/wordllama/algorithms/deduplicate_helpers.pyx +++ b/wordllama/algorithms/deduplicate_helpers.pyx @@ -1,5 +1,4 @@ # cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True -# distutils: define_macros=NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION import numpy as np cimport numpy as np diff --git a/wordllama/algorithms/kmeans.pyx b/wordllama/algorithms/kmeans.pyx index 5d285bd..3a48a2e 100644 --- a/wordllama/algorithms/kmeans.pyx +++ b/wordllama/algorithms/kmeans.pyx @@ -1,5 +1,4 @@ # cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True, fastmath=True -# distutils: define_macros=NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION import numpy as np from numpy.random import RandomState diff --git a/wordllama/algorithms/ragfile.pyx b/wordllama/algorithms/ragfile.pyx new file mode 100644 index 0000000..cd6cada --- /dev/null +++ b/wordllama/algorithms/ragfile.pyx @@ -0,0 +1,119 @@ +# wordllama/algorithms/ragfile.pyx + +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.string cimport memcmp, memset, memcpy +from libc.time cimport time +from libc.stdlib cimport malloc, free +import uuid +import numpy as np +cimport numpy as np + +# Declare the magic number array +cdef uint8_t MAGIC_NUMBER[4] + +# Set the magic number manually +MAGIC_NUMBER[0] = 0x52 # 'R' +MAGIC_NUMBER[1] = 0x41 # 'A' +MAGIC_NUMBER[2] = 0x47 # 'G' +MAGIC_NUMBER[3] = 0x01 # '\x01' + +cdef const uint16_t HEADER_VERSION = 1 + +cdef struct Header: + uint8_t magic_number[4] + uint16_t version + uint16_t flags + uint32_t vector_dim + uint8_t file_hash[32] + uint8_t is_binary + uint8_t model_id_hash[32] + uint64_t data_size + char data_format[16] + uint64_t timestamp + uint8_t uuid[16] + uint32_t header_checksum + +# Function to calculate UUID +cdef void calculate_uuid(uint8_t* uuid_bytes): 
+ cdef uuid.UUID u = uuid.uuid4() + memcpy(uuid_bytes, u.bytes, 16) + +# Function to calculate checksum +cdef uint32_t calculate_checksum(Header* header): + cdef uint32_t checksum = 0 + cdef uint8_t* data = header + cdef int i + for i in range(sizeof(Header) - 4): # Exclude the checksum field itself + checksum += data[i] + return checksum + +# Function to create the header +cdef Header* create_header(uint32_t vector_dim, uint8_t is_binary, char* model_id_hash, uint64_t data_size, char* data_format): + cdef Header* header = malloc(sizeof(Header)) + if not header: + raise MemoryError("Failed to allocate memory for Header") + + memcpy(header.magic_number, MAGIC_NUMBER, 4) + header.version = HEADER_VERSION + header.flags = 0 + header.vector_dim = vector_dim + memset(header.file_hash, 0, 32) # You can later fill this with an actual hash + header.is_binary = is_binary + memcpy(header.model_id_hash, model_id_hash, 32) + header.data_size = data_size + memcpy(header.data_format, data_format, 16) + header.timestamp = time(NULL) + calculate_uuid(header.uuid) + header.header_checksum = calculate_checksum(header) + + return header + +# Function to write the ragfile +cdef void write_ragfile(const char* filename, Header* header, np.ndarray[np.uint8_t, ndim=1] embeddings, np.ndarray[np.uint8_t, ndim=1] binary_data): + cdef FILE* f = fopen(filename, "wb") + if not f: + raise IOError("Failed to open file for writing") + + # Write the static header + fwrite(header, sizeof(Header), 1, f) + + # Write the embeddings + fwrite(embeddings.data, 1, embeddings.nbytes, f) + + # Calculate and write the padding + cdef int padding_size = 4096 - (sizeof(Header) + embeddings.nbytes) + cdef char padding[4096] + memset(padding, 0, padding_size) + fwrite(padding, 1, padding_size, f) + + # Write the binary data + fwrite(binary_data.data, 1, binary_data.nbytes, f) + + fclose(f) + +# Function to read the string from the ragfile +cdef char* read_string_from_ragfile(const char* filename, Header* header): + cdef FILE* f = fopen(filename, "rb") + if not f: + raise IOError("Failed to open file for reading") + + # Skip the header and padding + fseek(f, 4096, SEEK_SET) + + # Allocate space for the string + cdef char* string_data = malloc(header.data_size) + if not string_data: + fclose(f) + raise MemoryError("Failed to allocate memory for string data") + + fread(string_data, header.data_size, 1, f) + fclose(f) + + return string_data + +# Function to deallocate the header +cdef void deallocate_header(Header* header): + if header: + free(header) + diff --git a/wordllama/algorithms/topic_model.pyx b/wordllama/algorithms/topic_model.pyx new file mode 100644 index 0000000..7d724cd --- /dev/null +++ b/wordllama/algorithms/topic_model.pyx @@ -0,0 +1,173 @@ +# topic_model.pyx + +# Cython directives for optimization +# Disable bounds checking and wraparound for speed +# Ensure that array accesses are safe +# These can also be set in setup.py if preferred +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True + +import cython +from cython cimport boundscheck, wraparound, cdivision +import numpy as np +cimport numpy as np + +from collections import Counter +from itertools import islice +import tqdm + +# Type definitions for clarity and performance +ctypedef np.float64_t FLOAT +ctypedef np.int64_t INT + +# Function to generate n-grams (remains in Python for flexibility) +def generate_ngrams(token_ids, n=4): + """ + Generate n-grams from the list of token ids. + + Parameters: + - token_ids: List of token IDs. 
+ - n: The number of tokens in each n-gram. + + Returns: + - A generator of n-grams. + """ + return zip(*(islice(token_ids, i, None) for i in range(n))) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef list combine_overlapping_ngrams_cython(list ngrams_with_scores, int n, int k): + """ + Combine overlapping n-grams within the top-k window. + + Overlapping is defined as sharing (len(ngram) - 1) tokens. + + Parameters: + - ngrams_with_scores: List of tuples ((ngram), score) sorted by score descending. + - n: The length of the n-grams. + - k: The desired number of combined n-grams. + + Returns: + - List of combined n-grams with their aggregated scores. + """ + cdef list combined_ngrams = [] + cdef set used_token_ids = set() + cdef int i, j, overlap, idx + cdef tuple ngram + cdef float score + cdef tuple existing_ngram + cdef float existing_score + cdef tuple merged_ngram + cdef float merged_score + cdef int token_id + cdef bint overlap_flag + + for i in range(len(ngrams_with_scores)): + if len(combined_ngrams) >= k: + break + ngram, score = ngrams_with_scores[i] + + # Check if any token_id is already used + overlap_flag = False + for token_id in ngram: + if token_id in used_token_ids: + overlap_flag = True + break + + if not overlap_flag: + combined_ngrams.append((ngram, score)) + for token_id in ngram: + used_token_ids.add(token_id) + else: + # Attempt to merge with existing n-grams + for idx in range(len(combined_ngrams)): + existing_ngram, existing_score = combined_ngrams[idx] + overlap = 0 + for j in range(1, n): + # Replace negative indices with positive indices + # existing_ngram[-j:] -> existing_ngram[n - j :] + # ngram[:j] remains the same + if existing_ngram[n - j :] == ngram[:j]: + overlap = j + if overlap == n - 1: + # Merge the n-grams + # Replace ngram[-1] with ngram[n - 1] + merged_ngram = existing_ngram + (ngram[n - 1],) + merged_score = existing_score + score # Aggregation method + combined_ngrams[idx] = (merged_ngram, merged_score) + used_token_ids.add(ngram[n - 1]) + break + + return combined_ngrams[:k] + + +def top_k_token_ngrams(texts, wl, int k=10, int n=3): + """ + Extract the top-k non-overlapping n-grams from the texts. + + Parameters: + - texts: List of texts (each text is a string). + - wl: Language model with tokenizer and embeddings. + - k: Number of top n-grams to return. + - n: The number of tokens in each n-gram. + + Returns: + - List of top-k n-grams with their scores. 
+ """ + # Use Python's Counter since it's optimized and efficient + trigram_counter = Counter() + # Ensure wl.embedding is a NumPy array of type float64 + magnitudes_np = np.linalg.norm(wl.embedding, axis=1, keepdims=True) + # Cast to float64 to match FLOAT + magnitudes = magnitudes_np.astype(np.float64) + cdef np.ndarray[FLOAT, ndim=2] magnitudes_c = magnitudes + + # Iterate over each tokenized text (list of token ids) + for batch in tqdm.tqdm(texts, desc="Processing texts"): + tokenized_text = wl.tokenize([batch]) + for x in tokenized_text: + ngrams = generate_ngrams(x.ids, n) + trigram_counter.update(ngrams) + + # Get the top 10 * k most common n-grams + ngrams = trigram_counter.most_common(10 * k) + importances = [] + counts = [] + cdef tuple ngram + cdef int count + cdef float importance + cdef int i + + for ngram, count in ngrams: + importance = 0.0 + for token_id in ngram: + importance += magnitudes_c[token_id, 0] + importances.append(importance) + counts.append(count) + + iar = np.array(importances, dtype=np.float64) + counts_arr = np.array(counts, dtype=np.float64) + + # Compute scores in Python using NumPy's optimized functions + scores = [] + sorted_iar = np.sort(iar) + sorted_counts = np.sort(counts_arr) + for i in range(len(ngrams)): + p0 = np.searchsorted(sorted_iar, iar[i], side='right') / len(iar) + p1 = np.searchsorted(sorted_counts, counts_arr[i], side='right') / len(counts_arr) + score = (p0 + p1) / 2.0 + scores.append(score) + + # Combine ngrams with their scores + ngrams_with_scores = list(zip([ngram for ngram, _ in ngrams], scores)) + + # Sort ngrams by score in descending order + ngrams_with_scores.sort(key=lambda x: x[1], reverse=True) + + # Combine overlapping ngrams using the Cython-optimized function + combined_ngrams = combine_overlapping_ngrams_cython(ngrams_with_scores, n, k) + + return combined_ngrams + diff --git a/wordllama/algorithms/vector_similarity.pyx b/wordllama/algorithms/vector_similarity.pyx index 2bc4e18..232dfea 100644 --- a/wordllama/algorithms/vector_similarity.pyx +++ b/wordllama/algorithms/vector_similarity.pyx @@ -1,5 +1,4 @@ # cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True, nonecheck=False -# distutils: define_macros=NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION import numpy as np cimport numpy as np From 57a904818ed3452ceaf1ac24c70cee291c7fb167 Mon Sep 17 00:00:00 2001 From: Lee Miller <80222060+dleemiller@users.noreply.github.com> Date: Sun, 27 Oct 2024 19:33:38 -0600 Subject: [PATCH 5/7] Delete wordllama/algorithms/bm25_scorer.pyx --- wordllama/algorithms/bm25_scorer.pyx | 44 ---------------------------- 1 file changed, 44 deletions(-) delete mode 100644 wordllama/algorithms/bm25_scorer.pyx diff --git a/wordllama/algorithms/bm25_scorer.pyx b/wordllama/algorithms/bm25_scorer.pyx deleted file mode 100644 index 7f5c79e..0000000 --- a/wordllama/algorithms/bm25_scorer.pyx +++ /dev/null @@ -1,44 +0,0 @@ -import numpy as np -cimport numpy as np -from cython cimport boundscheck, wraparound - -# Memoryviews allow fast access to arrays -@boundscheck(False) # Disable bounds checking for performance -@wraparound(False) # Disable negative index wraparound for performance -cpdef void optimized_search( - np.ndarray[np.int32_t, ndim=1] query_idx, - np.ndarray[np.float32_t, ndim=2] similarity_matrix, - np.ndarray[np.float32_t, ndim=1] idf_vector, - np.ndarray[np.float32_t, ndim=1] scores, - list tokenized_texts, - float k1, float b, float avg_doc_len, int top_k): - - cdef int i, j, doc_length - cdef float alpha, beta, fq_sum, 
score_sum - cdef np.ndarray[np.int32_t, ndim=1] doc_idx - cdef np.ndarray[np.float32_t, ndim=1] fq - cdef np.ndarray[np.int32_t, ndim=2] mesh1, mesh2 - cdef np.ndarray[np.float32_t, ndim=1] temp_score - - alpha = k1 + 1 - - # Loop through the documents (tokenized_texts) - for i in range(len(tokenized_texts)): - doc = tokenized_texts[i] - doc_length = np.sum(doc.attention_mask) # Fast sum via NumPy - beta = k1 * (1 - b + b * (doc_length / avg_doc_len)) - - doc_idx = np.array([x for x in doc.ids if x > 0], dtype=np.int32) - - # Meshgrid to compute query-document term interactions - mesh1, mesh2 = np.meshgrid(query_idx, doc_idx, indexing='ij') - - # Access similarity_matrix via memoryviews - fq = similarity_matrix[mesh1, mesh2].sum(axis=0) - - # Reshape idf_vector[query_idx] to enable broadcasting - temp_score = (idf_vector[query_idx].reshape((-1, 1)) * ((fq * alpha) / (fq + beta))).sum(axis=0) - - # Sum the score for the document - scores[i] = np.sum(temp_score) - From 6c96c32aadb90ba5148419b297d6e3085cfc53b4 Mon Sep 17 00:00:00 2001 From: Lee Miller <80222060+dleemiller@users.noreply.github.com> Date: Sun, 27 Oct 2024 19:33:54 -0600 Subject: [PATCH 6/7] Delete wordllama/algorithms/ragfile.pyx --- wordllama/algorithms/ragfile.pyx | 119 ------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 wordllama/algorithms/ragfile.pyx diff --git a/wordllama/algorithms/ragfile.pyx b/wordllama/algorithms/ragfile.pyx deleted file mode 100644 index cd6cada..0000000 --- a/wordllama/algorithms/ragfile.pyx +++ /dev/null @@ -1,119 +0,0 @@ -# wordllama/algorithms/ragfile.pyx - -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -from libc.stdio cimport FILE, fopen, fread, fwrite, fclose -from libc.string cimport memcmp, memset, memcpy -from libc.time cimport time -from libc.stdlib cimport malloc, free -import uuid -import numpy as np -cimport numpy as np - -# Declare the magic number array -cdef uint8_t MAGIC_NUMBER[4] - -# Set the magic number manually -MAGIC_NUMBER[0] = 0x52 # 'R' -MAGIC_NUMBER[1] = 0x41 # 'A' -MAGIC_NUMBER[2] = 0x47 # 'G' -MAGIC_NUMBER[3] = 0x01 # '\x01' - -cdef const uint16_t HEADER_VERSION = 1 - -cdef struct Header: - uint8_t magic_number[4] - uint16_t version - uint16_t flags - uint32_t vector_dim - uint8_t file_hash[32] - uint8_t is_binary - uint8_t model_id_hash[32] - uint64_t data_size - char data_format[16] - uint64_t timestamp - uint8_t uuid[16] - uint32_t header_checksum - -# Function to calculate UUID -cdef void calculate_uuid(uint8_t* uuid_bytes): - cdef uuid.UUID u = uuid.uuid4() - memcpy(uuid_bytes, u.bytes, 16) - -# Function to calculate checksum -cdef uint32_t calculate_checksum(Header* header): - cdef uint32_t checksum = 0 - cdef uint8_t* data = header - cdef int i - for i in range(sizeof(Header) - 4): # Exclude the checksum field itself - checksum += data[i] - return checksum - -# Function to create the header -cdef Header* create_header(uint32_t vector_dim, uint8_t is_binary, char* model_id_hash, uint64_t data_size, char* data_format): - cdef Header* header = malloc(sizeof(Header)) - if not header: - raise MemoryError("Failed to allocate memory for Header") - - memcpy(header.magic_number, MAGIC_NUMBER, 4) - header.version = HEADER_VERSION - header.flags = 0 - header.vector_dim = vector_dim - memset(header.file_hash, 0, 32) # You can later fill this with an actual hash - header.is_binary = is_binary - memcpy(header.model_id_hash, model_id_hash, 32) - header.data_size = data_size - memcpy(header.data_format, data_format, 16) - 
header.timestamp = time(NULL) - calculate_uuid(header.uuid) - header.header_checksum = calculate_checksum(header) - - return header - -# Function to write the ragfile -cdef void write_ragfile(const char* filename, Header* header, np.ndarray[np.uint8_t, ndim=1] embeddings, np.ndarray[np.uint8_t, ndim=1] binary_data): - cdef FILE* f = fopen(filename, "wb") - if not f: - raise IOError("Failed to open file for writing") - - # Write the static header - fwrite(header, sizeof(Header), 1, f) - - # Write the embeddings - fwrite(embeddings.data, 1, embeddings.nbytes, f) - - # Calculate and write the padding - cdef int padding_size = 4096 - (sizeof(Header) + embeddings.nbytes) - cdef char padding[4096] - memset(padding, 0, padding_size) - fwrite(padding, 1, padding_size, f) - - # Write the binary data - fwrite(binary_data.data, 1, binary_data.nbytes, f) - - fclose(f) - -# Function to read the string from the ragfile -cdef char* read_string_from_ragfile(const char* filename, Header* header): - cdef FILE* f = fopen(filename, "rb") - if not f: - raise IOError("Failed to open file for reading") - - # Skip the header and padding - fseek(f, 4096, SEEK_SET) - - # Allocate space for the string - cdef char* string_data = malloc(header.data_size) - if not string_data: - fclose(f) - raise MemoryError("Failed to allocate memory for string data") - - fread(string_data, header.data_size, 1, f) - fclose(f) - - return string_data - -# Function to deallocate the header -cdef void deallocate_header(Header* header): - if header: - free(header) - From aac96c18f4a231e79c7c557b531275fff6e8ce56 Mon Sep 17 00:00:00 2001 From: Lee Miller <80222060+dleemiller@users.noreply.github.com> Date: Sun, 27 Oct 2024 19:34:07 -0600 Subject: [PATCH 7/7] Delete wordllama/algorithms/topic_model.pyx --- wordllama/algorithms/topic_model.pyx | 173 --------------------------- 1 file changed, 173 deletions(-) delete mode 100644 wordllama/algorithms/topic_model.pyx diff --git a/wordllama/algorithms/topic_model.pyx b/wordllama/algorithms/topic_model.pyx deleted file mode 100644 index 7d724cd..0000000 --- a/wordllama/algorithms/topic_model.pyx +++ /dev/null @@ -1,173 +0,0 @@ -# topic_model.pyx - -# Cython directives for optimization -# Disable bounds checking and wraparound for speed -# Ensure that array accesses are safe -# These can also be set in setup.py if preferred -# cython: boundscheck=False -# cython: wraparound=False -# cython: cdivision=True - -import cython -from cython cimport boundscheck, wraparound, cdivision -import numpy as np -cimport numpy as np - -from collections import Counter -from itertools import islice -import tqdm - -# Type definitions for clarity and performance -ctypedef np.float64_t FLOAT -ctypedef np.int64_t INT - -# Function to generate n-grams (remains in Python for flexibility) -def generate_ngrams(token_ids, n=4): - """ - Generate n-grams from the list of token ids. - - Parameters: - - token_ids: List of token IDs. - - n: The number of tokens in each n-gram. - - Returns: - - A generator of n-grams. - """ - return zip(*(islice(token_ids, i, None) for i in range(n))) - - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef list combine_overlapping_ngrams_cython(list ngrams_with_scores, int n, int k): - """ - Combine overlapping n-grams within the top-k window. - - Overlapping is defined as sharing (len(ngram) - 1) tokens. - - Parameters: - - ngrams_with_scores: List of tuples ((ngram), score) sorted by score descending. - - n: The length of the n-grams. - - k: The desired number of combined n-grams. 
- - Returns: - - List of combined n-grams with their aggregated scores. - """ - cdef list combined_ngrams = [] - cdef set used_token_ids = set() - cdef int i, j, overlap, idx - cdef tuple ngram - cdef float score - cdef tuple existing_ngram - cdef float existing_score - cdef tuple merged_ngram - cdef float merged_score - cdef int token_id - cdef bint overlap_flag - - for i in range(len(ngrams_with_scores)): - if len(combined_ngrams) >= k: - break - ngram, score = ngrams_with_scores[i] - - # Check if any token_id is already used - overlap_flag = False - for token_id in ngram: - if token_id in used_token_ids: - overlap_flag = True - break - - if not overlap_flag: - combined_ngrams.append((ngram, score)) - for token_id in ngram: - used_token_ids.add(token_id) - else: - # Attempt to merge with existing n-grams - for idx in range(len(combined_ngrams)): - existing_ngram, existing_score = combined_ngrams[idx] - overlap = 0 - for j in range(1, n): - # Replace negative indices with positive indices - # existing_ngram[-j:] -> existing_ngram[n - j :] - # ngram[:j] remains the same - if existing_ngram[n - j :] == ngram[:j]: - overlap = j - if overlap == n - 1: - # Merge the n-grams - # Replace ngram[-1] with ngram[n - 1] - merged_ngram = existing_ngram + (ngram[n - 1],) - merged_score = existing_score + score # Aggregation method - combined_ngrams[idx] = (merged_ngram, merged_score) - used_token_ids.add(ngram[n - 1]) - break - - return combined_ngrams[:k] - - -def top_k_token_ngrams(texts, wl, int k=10, int n=3): - """ - Extract the top-k non-overlapping n-grams from the texts. - - Parameters: - - texts: List of texts (each text is a string). - - wl: Language model with tokenizer and embeddings. - - k: Number of top n-grams to return. - - n: The number of tokens in each n-gram. - - Returns: - - List of top-k n-grams with their scores. 
- """ - # Use Python's Counter since it's optimized and efficient - trigram_counter = Counter() - # Ensure wl.embedding is a NumPy array of type float64 - magnitudes_np = np.linalg.norm(wl.embedding, axis=1, keepdims=True) - # Cast to float64 to match FLOAT - magnitudes = magnitudes_np.astype(np.float64) - cdef np.ndarray[FLOAT, ndim=2] magnitudes_c = magnitudes - - # Iterate over each tokenized text (list of token ids) - for batch in tqdm.tqdm(texts, desc="Processing texts"): - tokenized_text = wl.tokenize([batch]) - for x in tokenized_text: - ngrams = generate_ngrams(x.ids, n) - trigram_counter.update(ngrams) - - # Get the top 10 * k most common n-grams - ngrams = trigram_counter.most_common(10 * k) - importances = [] - counts = [] - cdef tuple ngram - cdef int count - cdef float importance - cdef int i - - for ngram, count in ngrams: - importance = 0.0 - for token_id in ngram: - importance += magnitudes_c[token_id, 0] - importances.append(importance) - counts.append(count) - - iar = np.array(importances, dtype=np.float64) - counts_arr = np.array(counts, dtype=np.float64) - - # Compute scores in Python using NumPy's optimized functions - scores = [] - sorted_iar = np.sort(iar) - sorted_counts = np.sort(counts_arr) - for i in range(len(ngrams)): - p0 = np.searchsorted(sorted_iar, iar[i], side='right') / len(iar) - p1 = np.searchsorted(sorted_counts, counts_arr[i], side='right') / len(counts_arr) - score = (p0 + p1) / 2.0 - scores.append(score) - - # Combine ngrams with their scores - ngrams_with_scores = list(zip([ngram for ngram, _ in ngrams], scores)) - - # Sort ngrams by score in descending order - ngrams_with_scores.sort(key=lambda x: x[1], reverse=True) - - # Combine overlapping ngrams using the Cython-optimized function - combined_ngrams = combine_overlapping_ngrams_cython(ngrams_with_scores, n, k) - - return combined_ngrams -
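
For reference, a minimal NumPy-only sketch of the computation that PATCH 2/7 moves from the hand-rolled popcount loop to np.bitwise_count (available in NumPy >= 2.0, hence the numpy>=2 pin in PATCH 1/7). This illustrates the same semantics, not the patched Cython code: it broadcasts the full (n, m, width) XOR in one step, whereas the patch iterates over rows of `a` to bound memory. The function name is invented for the example.

import numpy as np

def hamming_distance_sketch(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """a: (n, width) uint64, b: (m, width) uint64; each row packs width*64 bits."""
    # XOR every row of a against every row of b -> shape (n, m, width)
    xor = a[:, np.newaxis, :] ^ b[np.newaxis, :, :]
    # bitwise_count gives per-word set-bit counts; summing over the word
    # axis yields the Hamming distance between the packed bit vectors.
    return np.bitwise_count(xor).sum(axis=2, dtype=np.uint32)

a = np.random.randint(0, 2**63, size=(4, 8), dtype=np.uint64)
b = np.random.randint(0, 2**63, size=(3, 8), dtype=np.uint64)
print(hamming_distance_sketch(a, b).shape)  # (4, 3)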
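
Similarly, the pooling-dimension fix in PATCH 3/7 (allocating embedding_dim // 64 columns when self.binary is set) reflects that binarized embeddings are stored packed, 64 sign bits per uint64 word, which is exactly the layout hamming_distance consumes. Below is a small sketch of one way such packing can be done with plain NumPy; it is an assumption for illustration and not necessarily WordLlama's exact packing routine or bit order.

import numpy as np

def pack_sign_bits(emb: np.ndarray) -> np.ndarray:
    """emb: (n, d) float array with d a multiple of 64 -> (n, d // 64) uint64."""
    bits = (emb > 0).astype(np.uint8)        # sign-binarize each dimension
    packed = np.packbits(bits, axis=1)       # (n, d // 8) uint8
    return packed.view(np.uint64)            # reinterpret as (n, d // 64) uint64

emb = np.random.randn(5, 256).astype(np.float32)
packed = pack_sign_bits(emb)
print(packed.shape, packed.dtype)  # (5, 4) uint64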