Globbing based scraping #26

Open · wants to merge 2 commits into base: main
3 changes: 2 additions & 1 deletion local.txt
@@ -28,4 +28,5 @@ markdownify
jwt
sentence-transformers
marker-pdf
modal
modal
scikit-learn
2 changes: 1 addition & 1 deletion thepipe/__init__.py
@@ -8,7 +8,7 @@ def main() -> None:
if args.source.startswith("http"):
chunks = scrape_url(args.source, text_only=args.text_only, ai_extraction=args.ai_extraction, verbose=args.verbose, local=args.local)
elif os.path.isdir(args.source):
chunks = scrape_directory(args.source, include_regex=args.include_regex, text_only=args.text_only, ai_extraction=args.ai_extraction, verbose=args.verbose, local=args.local)
chunks = scrape_directory(args.source, include_regex=args.include_regex, include_pattern=args.include_pattern, verbose=args.verbose, ai_extraction=args.ai_extraction, text_only=args.text_only, local=args.local)
else:
chunks = scrape_file(args.source, text_only=args.text_only, ai_extraction=args.ai_extraction, verbose=args.verbose, local=args.local)
save_outputs(chunks=chunks, verbose=args.verbose, text_only=args.text_only)
54 changes: 45 additions & 9 deletions thepipe/core.py
@@ -9,6 +9,7 @@
import requests
from PIL import Image
from llama_index.core.schema import Document, ImageDocument
import weakref

HOST_IMAGES = os.getenv("HOST_IMAGES", "false").lower() == "true"
HOST_URL = os.getenv("THEPIPE_API_URL", "https://thepipe-api.up.railway.app")
@@ -18,10 +19,18 @@ class Chunk:
def __init__(self, path: Optional[str] = None, texts: Optional[List[str]] = [], images: Optional[List[Image.Image]] = [], audios: Optional[List] = [], videos: Optional[List] = []):
self.path = path
self.texts = texts
self.images = images
self.images = []
for img in (images or []):
if isinstance(img, weakref.ReferenceType):
self.images.append(img)
else:
self.images.append(weakref.ref(img))
self.audios = audios
self.videos = videos

def get_valid_images(self):
return [img() for img in self.images if img() is not None]

def to_llamaindex(self) -> List[Union[Document, ImageDocument]]:
document_text = "\n".join(self.texts)
if len(self.images) > 0:
@@ -90,6 +99,9 @@ def from_json(data: Dict, host_images: bool = False) -> 'Chunk':
videos=data['videos'],
)

def __repr__(self):
return f"Chunk(path={self.path}, texts={len(self.texts)} items, images={len(self.images)} items)"

def make_image_url(image: Image.Image, host_images: bool = False, max_resolution: Optional[int] = None) -> str:
if max_resolution:
width, height = image.size
@@ -137,9 +149,14 @@ def calculate_tokens(chunks: List[Chunk]) -> int:
n_tokens = 0
for chunk in chunks:
for text in chunk.texts:
n_tokens += len(text) / 4
for image in chunk.images:
n_tokens += calculate_image_tokens(image)
n_tokens += len(text) // 4 # Rough estimate: 1 token ≈ 4 characters
for image in chunk.get_valid_images():
try:
n_tokens += calculate_image_tokens(image)
except Exception as e:
print(f"[thepipe] Error calculating tokens for an image: {str(e)}")
# Add a default token count for failed images
n_tokens += 85 # Minimum token count for an image
return int(n_tokens)
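
As a rough worked example of this estimate (illustrative, not from the diff): a chunk holding 400 characters of text contributes 400 // 4 = 100 tokens, and an image whose token count cannot be computed falls back to the flat 85-token minimum, for an estimated total of 185 tokens.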

def chunks_to_messages(chunks: List[Chunk]) -> List[Dict]:
@@ -160,23 +177,42 @@ def save_outputs(chunks: List[Chunk], verbose: bool = False, text_only: bool = F
for chunk_text in chunk.texts:
text += f'```\n{chunk_text}\n```\n'
if chunk.images and not text_only:
for j, image in enumerate(chunk.images):
image.convert('RGB').save(f'outputs/{i}_{j}.jpg')
for j, image in enumerate(chunk.get_valid_images()):
try:
image.convert('RGB').save(f'outputs/{i}_{j}.jpg')
except Exception as e:
if verbose:
print(f"[thepipe] Error saving image at index {j} in chunk {i}: {str(e)}")

# Save the text
with open('outputs/prompt.txt', 'w', encoding='utf-8') as file:
file.write(text)

if verbose:
print(f"[thepipe] {calculate_tokens(chunks)} tokens saved to outputs folder")
try:
# Attempt to calculate tokens using the original method
token_count = calculate_tokens(chunks)
print(f"[thepipe] Approximately {token_count} tokens saved to outputs folder")
except Exception as e:
# If the original method fails, fall back to a simpler estimation
total_chars = sum(len(chunk_text) for chunk in chunks for chunk_text in chunk.texts)
estimated_tokens = total_chars // 4 # Rough estimate: 1 token ≈ 4 characters
print(f"[thepipe] Error calculating exact tokens: {str(e)}")
print(f"[thepipe] Estimated {estimated_tokens} tokens saved to outputs folder (based on character count)")

print(f"[thepipe] Outputs saved to 'outputs' folder")

def parse_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Compress project files into a context prompt.')
parser.add_argument('source', type=str, help='The source file or directory to compress.')
parser.add_argument('--include_regex', type=str, default=None, help='Regex pattern to match in a directory.')
group = parser.add_mutually_exclusive_group()
group.add_argument('--include_regex', type=str, nargs='?', const='.*', default=None,
help='Regex pattern to match in a directory. Use quotes for patterns with special characters.')
group.add_argument('--include_pattern', type=str, nargs='?', const='*', default=None,
help='Glob pattern to match files in a directory (e.g., "*.tsx"). Use quotes for patterns with special characters.')
parser.add_argument('--ai_extraction', action='store_true', help='Use ai_extraction to extract text from images.')
parser.add_argument('--text_only', action='store_true', help='Extract only text from the source.')
parser.add_argument('--verbose', action='store_true', help='Print status messages.')
parser.add_argument('--local', action='store_true', help='Print status messages.')
parser.add_argument('--local', action='store_true', help='Use local processing instead of API.')
args = parser.parse_args()
return args
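
A minimal usage sketch of the new mutually exclusive flags (illustrative only; the project path and pattern are made up, and the parser is driven through sys.argv):

# Illustrative only: exercising the mutually exclusive --include_pattern / --include_regex flags.
import sys
from thepipe.core import parse_arguments

sys.argv = ["thepipe", "./my_project", "--include_pattern", "*.tsx", "--verbose"]
args = parse_arguments()
print(args.include_pattern, args.include_regex)  # "*.tsx" None

# Supplying both --include_pattern and --include_regex in one invocation makes
# argparse exit with an error, since they share a mutually exclusive group.
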
69 changes: 54 additions & 15 deletions thepipe/scraper.py
@@ -130,23 +130,56 @@ def scrape_plaintext(file_path: str) -> List[Chunk]:
text = file.read()
return [Chunk(path=file_path, texts=[text])]

def scrape_directory(dir_path: str, include_regex: Optional[str] = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = False) -> List[Chunk]:
def scrape_directory(dir_path: str, include_regex: Optional[str] = None, include_pattern: Optional[str] = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = False) -> List[Chunk]:
extraction = []
all_files = glob.glob(f'{dir_path}/**/*', recursive=True)
if include_regex:
all_files = [file for file in all_files if re.search(include_regex, file, re.IGNORECASE)]

if include_pattern is not None:
# Use glob pattern
pattern = os.path.join(dir_path, '**', include_pattern)
all_files = glob.glob(pattern, recursive=True)
elif include_regex is not None:
# Use regex
all_files = []
for root, _, files in os.walk(dir_path):
for file in files:
file_path = os.path.join(root, file)
if re.search(include_regex, file_path, re.IGNORECASE):
all_files.append(file_path)
else:
# Neither pattern nor regex specified, include all files
all_files = []
for root, _, files in os.walk(dir_path):
for file in files:
all_files.append(os.path.join(root, file))

# Ensure we're only dealing with files
all_files = [f for f in all_files if os.path.isfile(f)]

if verbose:
print(f"[thepipe] Found {len(all_files)} files to process in {dir_path}")

with ThreadPoolExecutor() as executor:
results = executor.map(lambda file_path: scrape_file(filepath=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local), all_files)
results = executor.map(
lambda file_path: scrape_file(
filepath=file_path,
ai_extraction=ai_extraction,
text_only=text_only,
verbose=verbose,
local=local
),
all_files
)
for result in results:
extraction += result
extraction.extend(result)

return extraction
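
An illustrative programmatic call (not part of the diff) through the glob branch of the updated signature; the directory name and pattern are made up:

# Illustrative only: glob-based selection via the new include_pattern argument.
from thepipe.scraper import scrape_directory

chunks = scrape_directory(
    dir_path="./docs",
    include_pattern="*.md",  # expands to ./docs/**/*.md, recursive
    text_only=True,
    verbose=True,
)
print(f"[example] scraped {len(chunks)} chunks")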

def scrape_zip(file_path: str, include_regex: Optional[str] = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = False) -> List[Chunk]:
chunks = []
with tempfile.TemporaryDirectory() as temp_dir:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(temp_dir)
chunks = scrape_directory(dir_path=temp_dir, include_regex=include_regex, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local)
chunks = scrape_directory(dir_path=temp_dir, include_regex=include_regex, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local)
return chunks

def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False) -> List[Chunk]:
@@ -648,6 +681,7 @@ def scrape_docx(file_path: str, verbose: bool = False, text_only: bool = False)
from docx.text.paragraph import Paragraph
import csv
import io
import weakref

# helper function to iterate through blocks in the document
def iter_block_items(parent):
@@ -675,23 +709,22 @@ def read_docx_tables(tab):

# read the document
document = Document(file_path)
chunks = []
image_counter = 0

# Define namespaces
nsmap = {
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
}
chunks = []
image_counter = 0

try:
# scrape each block in the document to create chunks
# A block can be a paragraph, table, or image
for block in iter_block_items(document):
block_texts = []
block_images = []
if block.__class__.__name__ == 'Paragraph':
if isinstance(block, Paragraph):
block_texts.append(block.text)
if not text_only:
# "runs" are the smallest units in a paragraph
@@ -710,9 +743,9 @@ def read_docx_tables(tab):
image_data = io.BytesIO(image_part._blob)
image = Image.open(image_data)
image.load()
block_images.append(image)
block_images.append(image) # Append the image directly, not a weak reference
image_counter += 1
elif block.__class__.__name__ == 'Table':
elif isinstance(block, Table):
table_text = read_docx_tables(block)
block_texts.append(table_text)
if block_texts or block_images:
Expand All @@ -721,8 +754,14 @@ def read_docx_tables(tab):
finally:
# Close any open image files
for chunk in chunks:
for image in chunk.images:
image.close()
for img_ref in chunk.images:
img = img_ref() if isinstance(img_ref, weakref.ReferenceType) else img_ref
if img is not None:
try:
img.close()
except Exception as e:
if verbose:
print(f"[thepipe] Error closing image: {str(e)}")

return chunks
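
Since Chunk now keeps only weak references to images, downstream code should read them through get_valid_images(); an illustrative sketch follows (not part of the diff, file name made up). It may yield fewer images than were extracted if the underlying objects have already been collected:

# Illustrative only: consuming scrape_docx output with weak image references.
from thepipe.scraper import scrape_docx

chunks = scrape_docx("report.docx", verbose=True)
for chunk in chunks:
    for image in chunk.get_valid_images():  # skips references that have gone dead
        print(chunk.path, image.size)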
