From cfe4de13b4027dad60fb059c6e4d230241dbd65c Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sun, 27 Oct 2024 15:00:48 -0600 Subject: [PATCH 1/7] use numpy bitwise count, require numpy>=2 --- pyproject.toml | 8 ++++---- setup.py | 35 +++++------------------------------ 2 files changed, 9 insertions(+), 34 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 89b25b0..9517d77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,17 @@ [build-system] -requires = ["setuptools", "wheel", "setuptools_scm[toml]", "Cython", "numpy"] +requires = ["setuptools", "wheel", "setuptools_scm[toml]", "Cython", "numpy>=2"] build-backend = "setuptools.build_meta" [project] name = "wordllama" dynamic = ["version"] -description = "WordLlama Embedding Utility" +description = "WordLlama NLP Utility" readme = { file = "README.md", content-type = "text/markdown" } license = { file = "LICENSE" } -requires-python = ">=3.8" +requires-python = ">=3.9" authors = [{ name = "Lee Miller", email = "dleemiller@gmail.com" }] dependencies = [ - "numpy", + "numpy>=2", "safetensors", "tokenizers", "toml", diff --git a/setup.py b/setup.py index a5eeb36..cee6f44 100644 --- a/setup.py +++ b/setup.py @@ -5,32 +5,9 @@ numpy_include = np.get_include() -extra_compile_args = [] +extra_compile_args = ["-O3", "-ffast-math"] extra_link_args = [] -if platform.system() == "Darwin": - if platform.machine() == "arm64": - extra_compile_args.extend(["-arch", "arm64", "-O3", "-ffast-math"]) - extra_link_args.extend(["-arch", "arm64"]) - else: - extra_compile_args.extend(["-arch", "x86_64", "-O3", "-ffast-math"]) - extra_link_args.extend(["-arch", "x86_64"]) -elif platform.system() == "Windows": - extra_compile_args.extend(["/O2"]) -else: # Linux and others - if platform.machine().startswith("arm"): - if platform.architecture()[0] == "32bit": - extra_compile_args.extend(["-march=armv7-a", "-mfpu=neon"]) - extra_link_args.extend(["-march=armv7-a", "-mfpu=neon"]) - else: # 64-bit ARM - extra_compile_args.extend(["-march=armv8-a"]) - extra_link_args.extend(["-march=armv8-a"]) - elif platform.machine() in ["x86_64", "AMD64"]: - extra_compile_args.extend(["-march=native", "-mpopcnt"]) - extra_link_args.extend(["-march=native", "-mpopcnt"]) - -extra_compile_args.extend(["-O3", "-ffast-math"]) - extensions = [ Extension( "wordllama.algorithms.splitter", @@ -61,7 +38,7 @@ define_macros=[], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, - language="c++" + language="c++", ), Extension( "wordllama.algorithms.find_local_minima", @@ -70,7 +47,7 @@ define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, - language="c++" + language="c++", ), Extension( "wordllama.algorithms.vector_similarity", @@ -79,15 +56,13 @@ define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, - ) - - + ), ] setup( name="Embedding and lightweight NLP utility.", use_scm_version=True, - setup_requires=['setuptools_scm'], + setup_requires=["setuptools_scm"], ext_modules=cythonize( extensions, compiler_directives={ From 88d2d25663f82607a78f2bd515bc59ba682aca58 Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sun, 27 Oct 2024 15:02:16 -0600 Subject: [PATCH 2/7] removing hardware instructions in favor of numpy implementation --- wordllama/algorithms/vector_similarity.pyx | 68 +++++++--------------- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git 
a/wordllama/algorithms/vector_similarity.pyx b/wordllama/algorithms/vector_similarity.pyx index 234207c..2bc4e18 100644 --- a/wordllama/algorithms/vector_similarity.pyx +++ b/wordllama/algorithms/vector_similarity.pyx @@ -12,36 +12,8 @@ from numpy cimport ( np.import_array() -cdef extern from *: - """ - #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) - #include - static inline int popcount(uint64_t x) { - return __builtin_popcountll(x); - } - #elif defined(__GNUC__) && (defined(__ARM_NEON) || defined(__aarch64__)) - #include - static inline int popcount(uint64_t x) { - // No direct 64-bit popcount in NEON, need to split into two 32-bit parts - uint32_t lo = (uint32_t)x; - uint32_t hi = (uint32_t)(x >> 32); - return vaddv_u8(vcnt_u8(vcreate_u8(lo))) + vaddv_u8(vcnt_u8(vcreate_u8(hi))); - } - #else - static inline int popcount(uint64_t x) { - x = x - ((x >> 1) & 0x5555555555555555); - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0F; - x = x + (x >> 8); - x = x + (x >> 16); - x = x + (x >> 32); - return x & 0x0000007F; - } - #endif - """ - int popcount(uint64_t x) nogil - -cpdef object hamming_distance(object a, object b): +cpdef object hamming_distance(np.ndarray[np.uint64_t, ndim=2, mode='c'] a, + np.ndarray[np.uint64_t, ndim=2, mode='c'] b): """ Compute the Hamming distance between two arrays of binary vectors. @@ -52,32 +24,32 @@ cpdef object hamming_distance(object a, object b): Returns: np.ndarray: A 2D array containing the Hamming distances. """ - cdef Py_ssize_t i, j, k - cdef int dist + cdef Py_ssize_t i cdef Py_ssize_t n = a.shape[0] cdef Py_ssize_t m = b.shape[0] cdef Py_ssize_t width = a.shape[1] - - # Allocate distance array - distance = np.zeros((n, m), dtype=np.uint32) - - # Create a typed memoryview - cdef uint32_t[:, :] distance_view = distance - - # Ensure contiguous + if not a.flags.c_contiguous or not b.flags.c_contiguous: raise ValueError("Input arrays must be C-contiguous") - # Create typed memoryviews - cdef uint64_t[:, :] a_view = a - cdef uint64_t[:, :] b_view = b + cdef np.ndarray[np.uint32_t, ndim=2, mode='c'] distance = np.zeros((n, m), dtype=np.uint32) + cdef np.ndarray[np.uint64_t, ndim=1] a_row + cdef np.ndarray[np.uint64_t, ndim=2] xor_result + cdef np.ndarray[np.uint8_t, ndim=2] popcounts + cdef np.ndarray[np.uint32_t, ndim=1] distances_i for i in range(n): - for j in range(m): - dist = 0 - for k in range(width): - dist += popcount(a_view[i, k] ^ b_view[j, k]) - distance_view[i, j] = dist + a_row = a[i, :] + + # XOR 'a_row' and all rows in 'b' + xor_result = np.bitwise_xor(a_row[np.newaxis, :], b) + + # Compute popcounts + popcounts = np.bitwise_count(xor_result) + + # Sum to get Hamming distance + distances_i = np.sum(popcounts, axis=1, dtype=np.uint32) + distance[i, :] = distances_i return distance From eb62bb4025e1cb31360ccac55adf901bbbaf7d32 Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sun, 27 Oct 2024 15:16:35 -0600 Subject: [PATCH 3/7] fix pooling dim --- tests/test_functional.py | 10 +++++++++- wordllama/inference.py | 13 ++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 7e73df5..542d441 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -3,7 +3,15 @@ class TestFunctional(unittest.TestCase): - def test_function_clustering(self): wl = WordLlama.load() wl.cluster(["a", "b"], k=2) + + def test_function_similarity(self): + wl = WordLlama.load() + wl.similarity("a", "b") + + 
def test_function_similarity_binary(self): + wl = WordLlama.load() + wl.binary = True + wl.similarity("a", "b") diff --git a/wordllama/inference.py b/wordllama/inference.py index bf4ac3c..ff2f84f 100644 --- a/wordllama/inference.py +++ b/wordllama/inference.py @@ -94,7 +94,10 @@ def embed( num_texts = len(texts) embedding_dim = self.embedding.shape[1] np_type = np.float32 if not self.binary else np.uint64 - pooled_embeddings = np.empty((num_texts, embedding_dim), dtype=np_type) + pooled_embeddings = np.empty( + (num_texts, embedding_dim if not self.binary else embedding_dim // 64), + dtype=np_type, + ) for i in range(0, num_texts, batch_size): chunk = texts[i : i + batch_size] @@ -209,10 +212,10 @@ def rank( def deduplicate( self, - docs: List[str], - threshold: float = 0.9, - return_indices: bool = False, - batch_size: Optional[int] = None + docs: List[str], + threshold: float = 0.9, + return_indices: bool = False, + batch_size: Optional[int] = None, ) -> List[Union[str, int]]: """Deduplicate documents based on a similarity threshold. From ab1bc754b647381c388c676a35fbe573d78c2c47 Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sun, 27 Oct 2024 17:36:48 -0600 Subject: [PATCH 4/7] updating macros --- setup.py | 9 +- wordllama/algorithms/bm25_scorer.pyx | 44 +++++ wordllama/algorithms/deduplicate_helpers.pyx | 1 - wordllama/algorithms/kmeans.pyx | 1 - wordllama/algorithms/ragfile.pyx | 119 +++++++++++++ wordllama/algorithms/topic_model.pyx | 173 +++++++++++++++++++ wordllama/algorithms/vector_similarity.pyx | 1 - 7 files changed, 341 insertions(+), 7 deletions(-) create mode 100644 wordllama/algorithms/bm25_scorer.pyx create mode 100644 wordllama/algorithms/ragfile.pyx create mode 100644 wordllama/algorithms/topic_model.pyx diff --git a/setup.py b/setup.py index cee6f44..161beee 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ extra_compile_args = ["-O3", "-ffast-math"] extra_link_args = [] +define_macros = [("NPY_NO_DEPRECATED_API", "NPY_2_0_API_VERSION")] extensions = [ Extension( @@ -19,7 +20,7 @@ "wordllama.algorithms.deduplicate_helpers", ["wordllama/algorithms/deduplicate_helpers.pyx"], include_dirs=[numpy_include], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + define_macros=define_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ), @@ -27,7 +28,7 @@ "wordllama.algorithms.kmeans", ["wordllama/algorithms/kmeans.pyx"], include_dirs=[numpy_include], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + define_macros=define_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ), @@ -44,7 +45,7 @@ "wordllama.algorithms.find_local_minima", ["wordllama/algorithms/find_local_minima.pyx"], include_dirs=[numpy_include], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + define_macros=define_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, language="c++", @@ -53,7 +54,7 @@ "wordllama.algorithms.vector_similarity", ["wordllama/algorithms/vector_similarity.pyx"], include_dirs=[numpy_include], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + define_macros=define_macros, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ), diff --git a/wordllama/algorithms/bm25_scorer.pyx b/wordllama/algorithms/bm25_scorer.pyx new file mode 100644 index 0000000..7f5c79e --- /dev/null +++ b/wordllama/algorithms/bm25_scorer.pyx @@ -0,0 +1,44 @@ +import numpy as np +cimport numpy as np +from cython cimport boundscheck, 
wraparound + +# Memoryviews allow fast access to arrays +@boundscheck(False) # Disable bounds checking for performance +@wraparound(False) # Disable negative index wraparound for performance +cpdef void optimized_search( + np.ndarray[np.int32_t, ndim=1] query_idx, + np.ndarray[np.float32_t, ndim=2] similarity_matrix, + np.ndarray[np.float32_t, ndim=1] idf_vector, + np.ndarray[np.float32_t, ndim=1] scores, + list tokenized_texts, + float k1, float b, float avg_doc_len, int top_k): + + cdef int i, j, doc_length + cdef float alpha, beta, fq_sum, score_sum + cdef np.ndarray[np.int32_t, ndim=1] doc_idx + cdef np.ndarray[np.float32_t, ndim=1] fq + cdef np.ndarray[np.int32_t, ndim=2] mesh1, mesh2 + cdef np.ndarray[np.float32_t, ndim=1] temp_score + + alpha = k1 + 1 + + # Loop through the documents (tokenized_texts) + for i in range(len(tokenized_texts)): + doc = tokenized_texts[i] + doc_length = np.sum(doc.attention_mask) # Fast sum via NumPy + beta = k1 * (1 - b + b * (doc_length / avg_doc_len)) + + doc_idx = np.array([x for x in doc.ids if x > 0], dtype=np.int32) + + # Meshgrid to compute query-document term interactions + mesh1, mesh2 = np.meshgrid(query_idx, doc_idx, indexing='ij') + + # Access similarity_matrix via memoryviews + fq = similarity_matrix[mesh1, mesh2].sum(axis=0) + + # Reshape idf_vector[query_idx] to enable broadcasting + temp_score = (idf_vector[query_idx].reshape((-1, 1)) * ((fq * alpha) / (fq + beta))).sum(axis=0) + + # Sum the score for the document + scores[i] = np.sum(temp_score) + diff --git a/wordllama/algorithms/deduplicate_helpers.pyx b/wordllama/algorithms/deduplicate_helpers.pyx index 52de559..6b9e805 100644 --- a/wordllama/algorithms/deduplicate_helpers.pyx +++ b/wordllama/algorithms/deduplicate_helpers.pyx @@ -1,5 +1,4 @@ # cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True -# distutils: define_macros=NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION import numpy as np cimport numpy as np diff --git a/wordllama/algorithms/kmeans.pyx b/wordllama/algorithms/kmeans.pyx index 5d285bd..3a48a2e 100644 --- a/wordllama/algorithms/kmeans.pyx +++ b/wordllama/algorithms/kmeans.pyx @@ -1,5 +1,4 @@ # cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True, fastmath=True -# distutils: define_macros=NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION import numpy as np from numpy.random import RandomState diff --git a/wordllama/algorithms/ragfile.pyx b/wordllama/algorithms/ragfile.pyx new file mode 100644 index 0000000..cd6cada --- /dev/null +++ b/wordllama/algorithms/ragfile.pyx @@ -0,0 +1,119 @@ +# wordllama/algorithms/ragfile.pyx + +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.string cimport memcmp, memset, memcpy +from libc.time cimport time +from libc.stdlib cimport malloc, free +import uuid +import numpy as np +cimport numpy as np + +# Declare the magic number array +cdef uint8_t MAGIC_NUMBER[4] + +# Set the magic number manually +MAGIC_NUMBER[0] = 0x52 # 'R' +MAGIC_NUMBER[1] = 0x41 # 'A' +MAGIC_NUMBER[2] = 0x47 # 'G' +MAGIC_NUMBER[3] = 0x01 # '\x01' + +cdef const uint16_t HEADER_VERSION = 1 + +cdef struct Header: + uint8_t magic_number[4] + uint16_t version + uint16_t flags + uint32_t vector_dim + uint8_t file_hash[32] + uint8_t is_binary + uint8_t model_id_hash[32] + uint64_t data_size + char data_format[16] + uint64_t timestamp + uint8_t uuid[16] + uint32_t header_checksum + +# Function to calculate UUID +cdef void calculate_uuid(uint8_t* uuid_bytes): 
+ cdef uuid.UUID u = uuid.uuid4() + memcpy(uuid_bytes, u.bytes, 16) + +# Function to calculate checksum +cdef uint32_t calculate_checksum(Header* header): + cdef uint32_t checksum = 0 + cdef uint8_t* data = header + cdef int i + for i in range(sizeof(Header) - 4): # Exclude the checksum field itself + checksum += data[i] + return checksum + +# Function to create the header +cdef Header* create_header(uint32_t vector_dim, uint8_t is_binary, char* model_id_hash, uint64_t data_size, char* data_format): + cdef Header* header = malloc(sizeof(Header)) + if not header: + raise MemoryError("Failed to allocate memory for Header") + + memcpy(header.magic_number, MAGIC_NUMBER, 4) + header.version = HEADER_VERSION + header.flags = 0 + header.vector_dim = vector_dim + memset(header.file_hash, 0, 32) # You can later fill this with an actual hash + header.is_binary = is_binary + memcpy(header.model_id_hash, model_id_hash, 32) + header.data_size = data_size + memcpy(header.data_format, data_format, 16) + header.timestamp = time(NULL) + calculate_uuid(header.uuid) + header.header_checksum = calculate_checksum(header) + + return header + +# Function to write the ragfile +cdef void write_ragfile(const char* filename, Header* header, np.ndarray[np.uint8_t, ndim=1] embeddings, np.ndarray[np.uint8_t, ndim=1] binary_data): + cdef FILE* f = fopen(filename, "wb") + if not f: + raise IOError("Failed to open file for writing") + + # Write the static header + fwrite(header, sizeof(Header), 1, f) + + # Write the embeddings + fwrite(embeddings.data, 1, embeddings.nbytes, f) + + # Calculate and write the padding + cdef int padding_size = 4096 - (sizeof(Header) + embeddings.nbytes) + cdef char padding[4096] + memset(padding, 0, padding_size) + fwrite(padding, 1, padding_size, f) + + # Write the binary data + fwrite(binary_data.data, 1, binary_data.nbytes, f) + + fclose(f) + +# Function to read the string from the ragfile +cdef char* read_string_from_ragfile(const char* filename, Header* header): + cdef FILE* f = fopen(filename, "rb") + if not f: + raise IOError("Failed to open file for reading") + + # Skip the header and padding + fseek(f, 4096, SEEK_SET) + + # Allocate space for the string + cdef char* string_data = malloc(header.data_size) + if not string_data: + fclose(f) + raise MemoryError("Failed to allocate memory for string data") + + fread(string_data, header.data_size, 1, f) + fclose(f) + + return string_data + +# Function to deallocate the header +cdef void deallocate_header(Header* header): + if header: + free(header) + diff --git a/wordllama/algorithms/topic_model.pyx b/wordllama/algorithms/topic_model.pyx new file mode 100644 index 0000000..7d724cd --- /dev/null +++ b/wordllama/algorithms/topic_model.pyx @@ -0,0 +1,173 @@ +# topic_model.pyx + +# Cython directives for optimization +# Disable bounds checking and wraparound for speed +# Ensure that array accesses are safe +# These can also be set in setup.py if preferred +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True + +import cython +from cython cimport boundscheck, wraparound, cdivision +import numpy as np +cimport numpy as np + +from collections import Counter +from itertools import islice +import tqdm + +# Type definitions for clarity and performance +ctypedef np.float64_t FLOAT +ctypedef np.int64_t INT + +# Function to generate n-grams (remains in Python for flexibility) +def generate_ngrams(token_ids, n=4): + """ + Generate n-grams from the list of token ids. + + Parameters: + - token_ids: List of token IDs. 
+ - n: The number of tokens in each n-gram. + + Returns: + - A generator of n-grams. + """ + return zip(*(islice(token_ids, i, None) for i in range(n))) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef list combine_overlapping_ngrams_cython(list ngrams_with_scores, int n, int k): + """ + Combine overlapping n-grams within the top-k window. + + Overlapping is defined as sharing (len(ngram) - 1) tokens. + + Parameters: + - ngrams_with_scores: List of tuples ((ngram), score) sorted by score descending. + - n: The length of the n-grams. + - k: The desired number of combined n-grams. + + Returns: + - List of combined n-grams with their aggregated scores. + """ + cdef list combined_ngrams = [] + cdef set used_token_ids = set() + cdef int i, j, overlap, idx + cdef tuple ngram + cdef float score + cdef tuple existing_ngram + cdef float existing_score + cdef tuple merged_ngram + cdef float merged_score + cdef int token_id + cdef bint overlap_flag + + for i in range(len(ngrams_with_scores)): + if len(combined_ngrams) >= k: + break + ngram, score = ngrams_with_scores[i] + + # Check if any token_id is already used + overlap_flag = False + for token_id in ngram: + if token_id in used_token_ids: + overlap_flag = True + break + + if not overlap_flag: + combined_ngrams.append((ngram, score)) + for token_id in ngram: + used_token_ids.add(token_id) + else: + # Attempt to merge with existing n-grams + for idx in range(len(combined_ngrams)): + existing_ngram, existing_score = combined_ngrams[idx] + overlap = 0 + for j in range(1, n): + # Replace negative indices with positive indices + # existing_ngram[-j:] -> existing_ngram[n - j :] + # ngram[:j] remains the same + if existing_ngram[n - j :] == ngram[:j]: + overlap = j + if overlap == n - 1: + # Merge the n-grams + # Replace ngram[-1] with ngram[n - 1] + merged_ngram = existing_ngram + (ngram[n - 1],) + merged_score = existing_score + score # Aggregation method + combined_ngrams[idx] = (merged_ngram, merged_score) + used_token_ids.add(ngram[n - 1]) + break + + return combined_ngrams[:k] + + +def top_k_token_ngrams(texts, wl, int k=10, int n=3): + """ + Extract the top-k non-overlapping n-grams from the texts. + + Parameters: + - texts: List of texts (each text is a string). + - wl: Language model with tokenizer and embeddings. + - k: Number of top n-grams to return. + - n: The number of tokens in each n-gram. + + Returns: + - List of top-k n-grams with their scores. 
+ """ + # Use Python's Counter since it's optimized and efficient + trigram_counter = Counter() + # Ensure wl.embedding is a NumPy array of type float64 + magnitudes_np = np.linalg.norm(wl.embedding, axis=1, keepdims=True) + # Cast to float64 to match FLOAT + magnitudes = magnitudes_np.astype(np.float64) + cdef np.ndarray[FLOAT, ndim=2] magnitudes_c = magnitudes + + # Iterate over each tokenized text (list of token ids) + for batch in tqdm.tqdm(texts, desc="Processing texts"): + tokenized_text = wl.tokenize([batch]) + for x in tokenized_text: + ngrams = generate_ngrams(x.ids, n) + trigram_counter.update(ngrams) + + # Get the top 10 * k most common n-grams + ngrams = trigram_counter.most_common(10 * k) + importances = [] + counts = [] + cdef tuple ngram + cdef int count + cdef float importance + cdef int i + + for ngram, count in ngrams: + importance = 0.0 + for token_id in ngram: + importance += magnitudes_c[token_id, 0] + importances.append(importance) + counts.append(count) + + iar = np.array(importances, dtype=np.float64) + counts_arr = np.array(counts, dtype=np.float64) + + # Compute scores in Python using NumPy's optimized functions + scores = [] + sorted_iar = np.sort(iar) + sorted_counts = np.sort(counts_arr) + for i in range(len(ngrams)): + p0 = np.searchsorted(sorted_iar, iar[i], side='right') / len(iar) + p1 = np.searchsorted(sorted_counts, counts_arr[i], side='right') / len(counts_arr) + score = (p0 + p1) / 2.0 + scores.append(score) + + # Combine ngrams with their scores + ngrams_with_scores = list(zip([ngram for ngram, _ in ngrams], scores)) + + # Sort ngrams by score in descending order + ngrams_with_scores.sort(key=lambda x: x[1], reverse=True) + + # Combine overlapping ngrams using the Cython-optimized function + combined_ngrams = combine_overlapping_ngrams_cython(ngrams_with_scores, n, k) + + return combined_ngrams + diff --git a/wordllama/algorithms/vector_similarity.pyx b/wordllama/algorithms/vector_similarity.pyx index 2bc4e18..232dfea 100644 --- a/wordllama/algorithms/vector_similarity.pyx +++ b/wordllama/algorithms/vector_similarity.pyx @@ -1,5 +1,4 @@ # cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True, nonecheck=False -# distutils: define_macros=NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION import numpy as np cimport numpy as np From 57a904818ed3452ceaf1ac24c70cee291c7fb167 Mon Sep 17 00:00:00 2001 From: Lee Miller <80222060+dleemiller@users.noreply.github.com> Date: Sun, 27 Oct 2024 19:33:38 -0600 Subject: [PATCH 5/7] Delete wordllama/algorithms/bm25_scorer.pyx --- wordllama/algorithms/bm25_scorer.pyx | 44 ---------------------------- 1 file changed, 44 deletions(-) delete mode 100644 wordllama/algorithms/bm25_scorer.pyx diff --git a/wordllama/algorithms/bm25_scorer.pyx b/wordllama/algorithms/bm25_scorer.pyx deleted file mode 100644 index 7f5c79e..0000000 --- a/wordllama/algorithms/bm25_scorer.pyx +++ /dev/null @@ -1,44 +0,0 @@ -import numpy as np -cimport numpy as np -from cython cimport boundscheck, wraparound - -# Memoryviews allow fast access to arrays -@boundscheck(False) # Disable bounds checking for performance -@wraparound(False) # Disable negative index wraparound for performance -cpdef void optimized_search( - np.ndarray[np.int32_t, ndim=1] query_idx, - np.ndarray[np.float32_t, ndim=2] similarity_matrix, - np.ndarray[np.float32_t, ndim=1] idf_vector, - np.ndarray[np.float32_t, ndim=1] scores, - list tokenized_texts, - float k1, float b, float avg_doc_len, int top_k): - - cdef int i, j, doc_length - cdef float alpha, beta, fq_sum, 
score_sum - cdef np.ndarray[np.int32_t, ndim=1] doc_idx - cdef np.ndarray[np.float32_t, ndim=1] fq - cdef np.ndarray[np.int32_t, ndim=2] mesh1, mesh2 - cdef np.ndarray[np.float32_t, ndim=1] temp_score - - alpha = k1 + 1 - - # Loop through the documents (tokenized_texts) - for i in range(len(tokenized_texts)): - doc = tokenized_texts[i] - doc_length = np.sum(doc.attention_mask) # Fast sum via NumPy - beta = k1 * (1 - b + b * (doc_length / avg_doc_len)) - - doc_idx = np.array([x for x in doc.ids if x > 0], dtype=np.int32) - - # Meshgrid to compute query-document term interactions - mesh1, mesh2 = np.meshgrid(query_idx, doc_idx, indexing='ij') - - # Access similarity_matrix via memoryviews - fq = similarity_matrix[mesh1, mesh2].sum(axis=0) - - # Reshape idf_vector[query_idx] to enable broadcasting - temp_score = (idf_vector[query_idx].reshape((-1, 1)) * ((fq * alpha) / (fq + beta))).sum(axis=0) - - # Sum the score for the document - scores[i] = np.sum(temp_score) - From 6c96c32aadb90ba5148419b297d6e3085cfc53b4 Mon Sep 17 00:00:00 2001 From: Lee Miller <80222060+dleemiller@users.noreply.github.com> Date: Sun, 27 Oct 2024 19:33:54 -0600 Subject: [PATCH 6/7] Delete wordllama/algorithms/ragfile.pyx --- wordllama/algorithms/ragfile.pyx | 119 ------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 wordllama/algorithms/ragfile.pyx diff --git a/wordllama/algorithms/ragfile.pyx b/wordllama/algorithms/ragfile.pyx deleted file mode 100644 index cd6cada..0000000 --- a/wordllama/algorithms/ragfile.pyx +++ /dev/null @@ -1,119 +0,0 @@ -# wordllama/algorithms/ragfile.pyx - -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -from libc.stdio cimport FILE, fopen, fread, fwrite, fclose -from libc.string cimport memcmp, memset, memcpy -from libc.time cimport time -from libc.stdlib cimport malloc, free -import uuid -import numpy as np -cimport numpy as np - -# Declare the magic number array -cdef uint8_t MAGIC_NUMBER[4] - -# Set the magic number manually -MAGIC_NUMBER[0] = 0x52 # 'R' -MAGIC_NUMBER[1] = 0x41 # 'A' -MAGIC_NUMBER[2] = 0x47 # 'G' -MAGIC_NUMBER[3] = 0x01 # '\x01' - -cdef const uint16_t HEADER_VERSION = 1 - -cdef struct Header: - uint8_t magic_number[4] - uint16_t version - uint16_t flags - uint32_t vector_dim - uint8_t file_hash[32] - uint8_t is_binary - uint8_t model_id_hash[32] - uint64_t data_size - char data_format[16] - uint64_t timestamp - uint8_t uuid[16] - uint32_t header_checksum - -# Function to calculate UUID -cdef void calculate_uuid(uint8_t* uuid_bytes): - cdef uuid.UUID u = uuid.uuid4() - memcpy(uuid_bytes, u.bytes, 16) - -# Function to calculate checksum -cdef uint32_t calculate_checksum(Header* header): - cdef uint32_t checksum = 0 - cdef uint8_t* data = header - cdef int i - for i in range(sizeof(Header) - 4): # Exclude the checksum field itself - checksum += data[i] - return checksum - -# Function to create the header -cdef Header* create_header(uint32_t vector_dim, uint8_t is_binary, char* model_id_hash, uint64_t data_size, char* data_format): - cdef Header* header = malloc(sizeof(Header)) - if not header: - raise MemoryError("Failed to allocate memory for Header") - - memcpy(header.magic_number, MAGIC_NUMBER, 4) - header.version = HEADER_VERSION - header.flags = 0 - header.vector_dim = vector_dim - memset(header.file_hash, 0, 32) # You can later fill this with an actual hash - header.is_binary = is_binary - memcpy(header.model_id_hash, model_id_hash, 32) - header.data_size = data_size - memcpy(header.data_format, data_format, 16) - 
header.timestamp = time(NULL) - calculate_uuid(header.uuid) - header.header_checksum = calculate_checksum(header) - - return header - -# Function to write the ragfile -cdef void write_ragfile(const char* filename, Header* header, np.ndarray[np.uint8_t, ndim=1] embeddings, np.ndarray[np.uint8_t, ndim=1] binary_data): - cdef FILE* f = fopen(filename, "wb") - if not f: - raise IOError("Failed to open file for writing") - - # Write the static header - fwrite(header, sizeof(Header), 1, f) - - # Write the embeddings - fwrite(embeddings.data, 1, embeddings.nbytes, f) - - # Calculate and write the padding - cdef int padding_size = 4096 - (sizeof(Header) + embeddings.nbytes) - cdef char padding[4096] - memset(padding, 0, padding_size) - fwrite(padding, 1, padding_size, f) - - # Write the binary data - fwrite(binary_data.data, 1, binary_data.nbytes, f) - - fclose(f) - -# Function to read the string from the ragfile -cdef char* read_string_from_ragfile(const char* filename, Header* header): - cdef FILE* f = fopen(filename, "rb") - if not f: - raise IOError("Failed to open file for reading") - - # Skip the header and padding - fseek(f, 4096, SEEK_SET) - - # Allocate space for the string - cdef char* string_data = malloc(header.data_size) - if not string_data: - fclose(f) - raise MemoryError("Failed to allocate memory for string data") - - fread(string_data, header.data_size, 1, f) - fclose(f) - - return string_data - -# Function to deallocate the header -cdef void deallocate_header(Header* header): - if header: - free(header) - From aac96c18f4a231e79c7c557b531275fff6e8ce56 Mon Sep 17 00:00:00 2001 From: Lee Miller <80222060+dleemiller@users.noreply.github.com> Date: Sun, 27 Oct 2024 19:34:07 -0600 Subject: [PATCH 7/7] Delete wordllama/algorithms/topic_model.pyx --- wordllama/algorithms/topic_model.pyx | 173 --------------------------- 1 file changed, 173 deletions(-) delete mode 100644 wordllama/algorithms/topic_model.pyx diff --git a/wordllama/algorithms/topic_model.pyx b/wordllama/algorithms/topic_model.pyx deleted file mode 100644 index 7d724cd..0000000 --- a/wordllama/algorithms/topic_model.pyx +++ /dev/null @@ -1,173 +0,0 @@ -# topic_model.pyx - -# Cython directives for optimization -# Disable bounds checking and wraparound for speed -# Ensure that array accesses are safe -# These can also be set in setup.py if preferred -# cython: boundscheck=False -# cython: wraparound=False -# cython: cdivision=True - -import cython -from cython cimport boundscheck, wraparound, cdivision -import numpy as np -cimport numpy as np - -from collections import Counter -from itertools import islice -import tqdm - -# Type definitions for clarity and performance -ctypedef np.float64_t FLOAT -ctypedef np.int64_t INT - -# Function to generate n-grams (remains in Python for flexibility) -def generate_ngrams(token_ids, n=4): - """ - Generate n-grams from the list of token ids. - - Parameters: - - token_ids: List of token IDs. - - n: The number of tokens in each n-gram. - - Returns: - - A generator of n-grams. - """ - return zip(*(islice(token_ids, i, None) for i in range(n))) - - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef list combine_overlapping_ngrams_cython(list ngrams_with_scores, int n, int k): - """ - Combine overlapping n-grams within the top-k window. - - Overlapping is defined as sharing (len(ngram) - 1) tokens. - - Parameters: - - ngrams_with_scores: List of tuples ((ngram), score) sorted by score descending. - - n: The length of the n-grams. - - k: The desired number of combined n-grams. 
- - Returns: - - List of combined n-grams with their aggregated scores. - """ - cdef list combined_ngrams = [] - cdef set used_token_ids = set() - cdef int i, j, overlap, idx - cdef tuple ngram - cdef float score - cdef tuple existing_ngram - cdef float existing_score - cdef tuple merged_ngram - cdef float merged_score - cdef int token_id - cdef bint overlap_flag - - for i in range(len(ngrams_with_scores)): - if len(combined_ngrams) >= k: - break - ngram, score = ngrams_with_scores[i] - - # Check if any token_id is already used - overlap_flag = False - for token_id in ngram: - if token_id in used_token_ids: - overlap_flag = True - break - - if not overlap_flag: - combined_ngrams.append((ngram, score)) - for token_id in ngram: - used_token_ids.add(token_id) - else: - # Attempt to merge with existing n-grams - for idx in range(len(combined_ngrams)): - existing_ngram, existing_score = combined_ngrams[idx] - overlap = 0 - for j in range(1, n): - # Replace negative indices with positive indices - # existing_ngram[-j:] -> existing_ngram[n - j :] - # ngram[:j] remains the same - if existing_ngram[n - j :] == ngram[:j]: - overlap = j - if overlap == n - 1: - # Merge the n-grams - # Replace ngram[-1] with ngram[n - 1] - merged_ngram = existing_ngram + (ngram[n - 1],) - merged_score = existing_score + score # Aggregation method - combined_ngrams[idx] = (merged_ngram, merged_score) - used_token_ids.add(ngram[n - 1]) - break - - return combined_ngrams[:k] - - -def top_k_token_ngrams(texts, wl, int k=10, int n=3): - """ - Extract the top-k non-overlapping n-grams from the texts. - - Parameters: - - texts: List of texts (each text is a string). - - wl: Language model with tokenizer and embeddings. - - k: Number of top n-grams to return. - - n: The number of tokens in each n-gram. - - Returns: - - List of top-k n-grams with their scores. 
- """ - # Use Python's Counter since it's optimized and efficient - trigram_counter = Counter() - # Ensure wl.embedding is a NumPy array of type float64 - magnitudes_np = np.linalg.norm(wl.embedding, axis=1, keepdims=True) - # Cast to float64 to match FLOAT - magnitudes = magnitudes_np.astype(np.float64) - cdef np.ndarray[FLOAT, ndim=2] magnitudes_c = magnitudes - - # Iterate over each tokenized text (list of token ids) - for batch in tqdm.tqdm(texts, desc="Processing texts"): - tokenized_text = wl.tokenize([batch]) - for x in tokenized_text: - ngrams = generate_ngrams(x.ids, n) - trigram_counter.update(ngrams) - - # Get the top 10 * k most common n-grams - ngrams = trigram_counter.most_common(10 * k) - importances = [] - counts = [] - cdef tuple ngram - cdef int count - cdef float importance - cdef int i - - for ngram, count in ngrams: - importance = 0.0 - for token_id in ngram: - importance += magnitudes_c[token_id, 0] - importances.append(importance) - counts.append(count) - - iar = np.array(importances, dtype=np.float64) - counts_arr = np.array(counts, dtype=np.float64) - - # Compute scores in Python using NumPy's optimized functions - scores = [] - sorted_iar = np.sort(iar) - sorted_counts = np.sort(counts_arr) - for i in range(len(ngrams)): - p0 = np.searchsorted(sorted_iar, iar[i], side='right') / len(iar) - p1 = np.searchsorted(sorted_counts, counts_arr[i], side='right') / len(counts_arr) - score = (p0 + p1) / 2.0 - scores.append(score) - - # Combine ngrams with their scores - ngrams_with_scores = list(zip([ngram for ngram, _ in ngrams], scores)) - - # Sort ngrams by score in descending order - ngrams_with_scores.sort(key=lambda x: x[1], reverse=True) - - # Combine overlapping ngrams using the Cython-optimized function - combined_ngrams = combine_overlapping_ngrams_cython(ngrams_with_scores, n, k) - - return combined_ngrams -
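
For reference, a minimal NumPy-only sketch of the computation that PATCH 2/7 moves from the hand-rolled popcount loop to np.bitwise_count (available in NumPy >= 2.0, hence the numpy>=2 pin in PATCH 1/7). This illustrates the same semantics, not the patched Cython code: it broadcasts the full (n, m, width) XOR in one step, whereas the patch iterates over rows of `a` to bound memory. The function name is invented for the example.

import numpy as np

def hamming_distance_sketch(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """a: (n, width) uint64, b: (m, width) uint64; each row packs width*64 bits."""
    # XOR every row of a against every row of b -> shape (n, m, width)
    xor = a[:, np.newaxis, :] ^ b[np.newaxis, :, :]
    # bitwise_count gives per-word set-bit counts; summing over the word
    # axis yields the Hamming distance between the packed bit vectors.
    return np.bitwise_count(xor).sum(axis=2, dtype=np.uint32)

a = np.random.randint(0, 2**63, size=(4, 8), dtype=np.uint64)
b = np.random.randint(0, 2**63, size=(3, 8), dtype=np.uint64)
print(hamming_distance_sketch(a, b).shape)  # (4, 3)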
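
Similarly, the pooling-dimension fix in PATCH 3/7 (allocating embedding_dim // 64 columns when self.binary is set) reflects that binarized embeddings are stored packed, 64 sign bits per uint64 word, which is exactly the layout hamming_distance consumes. Below is a small sketch of one way such packing can be done with plain NumPy; it is an assumption for illustration and not necessarily WordLlama's exact packing routine or bit order.

import numpy as np

def pack_sign_bits(emb: np.ndarray) -> np.ndarray:
    """emb: (n, d) float array with d a multiple of 64 -> (n, d // 64) uint64."""
    bits = (emb > 0).astype(np.uint8)        # sign-binarize each dimension
    packed = np.packbits(bits, axis=1)       # (n, d // 8) uint8
    return packed.view(np.uint64)            # reinterpret as (n, d // 64) uint64

emb = np.random.randn(5, 256).astype(np.float32)
packed = pack_sign_bits(emb)
print(packed.shape, packed.dtype)  # (5, 4) uint64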