Globbing based scraping #26

Open · wants to merge 2 commits into base: main
3 changes: 2 additions & 1 deletion local.txt
@@ -28,4 +28,5 @@ markdownify
jwt
sentence-transformers
marker-pdf
modal
modal
scikit-learn
2 changes: 1 addition & 1 deletion thepipe/__init__.py
@@ -8,7 +8,7 @@ def main() -> None:
if args.source.startswith("http"):
chunks = scrape_url(args.source, text_only=args.text_only, ai_extraction=args.ai_extraction, verbose=args.verbose, local=args.local)
elif os.path.isdir(args.source):
chunks = scrape_directory(args.source, include_regex=args.include_regex, text_only=args.text_only, ai_extraction=args.ai_extraction, verbose=args.verbose, local=args.local)
chunks = scrape_directory(args.source, include_regex=args.include_regex, include_pattern=args.include_pattern, verbose=args.verbose, ai_extraction=args.ai_extraction, text_only=args.text_only, local=args.local)
else:
chunks = scrape_file(args.source, text_only=args.text_only, ai_extraction=args.ai_extraction, verbose=args.verbose, local=args.local)
save_outputs(chunks=chunks, verbose=args.verbose, text_only=args.text_only)
54 changes: 45 additions & 9 deletions thepipe/core.py
@@ -9,6 +9,7 @@
import requests
from PIL import Image
from llama_index.core.schema import Document, ImageDocument
import weakref

HOST_IMAGES = os.getenv("HOST_IMAGES", "false").lower() == "true"
HOST_URL = os.getenv("THEPIPE_API_URL", "https://thepipe-api.up.railway.app")
@@ -18,10 +19,18 @@ class Chunk:
def __init__(self, path: Optional[str] = None, texts: Optional[List[str]] = [], images: Optional[List[Image.Image]] = [], audios: Optional[List] = [], videos: Optional[List] = []):
self.path = path
self.texts = texts
self.images = images
self.images = []
for img in (images or []):
if isinstance(img, weakref.ReferenceType):
self.images.append(img)
else:
self.images.append(weakref.ref(img))
self.audios = audios
self.videos = videos

def get_valid_images(self):
return [img() for img in self.images if img() is not None]

def to_llamaindex(self) -> List[Union[Document, ImageDocument]]:
document_text = "\n".join(self.texts)
if len(self.images) > 0:
@@ -90,6 +99,9 @@ def from_json(data: Dict, host_images: bool = False) -> 'Chunk':
videos=data['videos'],
)

def __repr__(self):
return f"Chunk(path={self.path}, texts={len(self.texts)} items, images={len(self.images)} items)"

def make_image_url(image: Image.Image, host_images: bool = False, max_resolution: Optional[int] = None) -> str:
if max_resolution:
width, height = image.size
@@ -137,9 +149,14 @@ def calculate_tokens(chunks: List[Chunk]) -> int:
n_tokens = 0
for chunk in chunks:
for text in chunk.texts:
n_tokens += len(text) / 4
for image in chunk.images:
n_tokens += calculate_image_tokens(image)
n_tokens += len(text) // 4 # Rough estimate: 1 token ≈ 4 characters
for image in chunk.get_valid_images():
try:
n_tokens += calculate_image_tokens(image)
except Exception as e:
print(f"[thepipe] Error calculating tokens for an image: {str(e)}")
# Add a default token count for failed images
n_tokens += 85 # Minimum token count for an image
return int(n_tokens)
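
As a rough worked example of this estimate (illustrative, not from the diff): a chunk holding 400 characters of text contributes 400 // 4 = 100 tokens, and an image whose token count cannot be computed falls back to the flat 85-token minimum, for an estimated total of 185 tokens.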

def chunks_to_messages(chunks: List[Chunk]) -> List[Dict]:
@@ -160,23 +177,42 @@ def save_outputs(chunks: List[Chunk], verbose: bool = False, text_only: bool = F
for chunk_text in chunk.texts:
text += f'```\n{chunk_text}\n```\n'
if chunk.images and not text_only:
for j, image in enumerate(chunk.images):
image.convert('RGB').save(f'outputs/{i}_{j}.jpg')
for j, image in enumerate(chunk.get_valid_images()):
try:
image.convert('RGB').save(f'outputs/{i}_{j}.jpg')
except Exception as e:
if verbose:
print(f"[thepipe] Error saving image at index {j} in chunk {i}: {str(e)}")

# Save the text
with open('outputs/prompt.txt', 'w', encoding='utf-8') as file:
file.write(text)

if verbose:
print(f"[thepipe] {calculate_tokens(chunks)} tokens saved to outputs folder")
try:
# Attempt to calculate tokens using the original method
token_count = calculate_tokens(chunks)
print(f"[thepipe] Approximately {token_count} tokens saved to outputs folder")
except Exception as e:
# If the original method fails, fall back to a simpler estimation
total_chars = sum(len(chunk_text) for chunk in chunks for chunk_text in chunk.texts)
estimated_tokens = total_chars // 4 # Rough estimate: 1 token ≈ 4 characters
print(f"[thepipe] Error calculating exact tokens: {str(e)}")
print(f"[thepipe] Estimated {estimated_tokens} tokens saved to outputs folder (based on character count)")

print(f"[thepipe] Outputs saved to 'outputs' folder")

def parse_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Compress project files into a context prompt.')
parser.add_argument('source', type=str, help='The source file or directory to compress.')
parser.add_argument('--include_regex', type=str, default=None, help='Regex pattern to match in a directory.')
group = parser.add_mutually_exclusive_group()
group.add_argument('--include_regex', type=str, nargs='?', const='.*', default=None,
help='Regex pattern to match in a directory. Use quotes for patterns with special characters.')
group.add_argument('--include_pattern', type=str, nargs='?', const='*', default=None,
help='Glob pattern to match files in a directory (e.g., "*.tsx"). Use quotes for patterns with special characters.')
parser.add_argument('--ai_extraction', action='store_true', help='Use ai_extraction to extract text from images.')
parser.add_argument('--text_only', action='store_true', help='Extract only text from the source.')
parser.add_argument('--verbose', action='store_true', help='Print status messages.')
parser.add_argument('--local', action='store_true', help='Print status messages.')
parser.add_argument('--local', action='store_true', help='Use local processing instead of API.')
args = parser.parse_args()
return args
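
A minimal usage sketch of the new mutually exclusive flags (illustrative only; the project path and pattern are made up, and the parser is driven through sys.argv):

# Illustrative only: exercising the mutually exclusive --include_pattern / --include_regex flags.
import sys
from thepipe.core import parse_arguments

sys.argv = ["thepipe", "./my_project", "--include_pattern", "*.tsx", "--verbose"]
args = parse_arguments()
print(args.include_pattern, args.include_regex)  # "*.tsx" None

# Supplying both --include_pattern and --include_regex in one invocation makes
# argparse exit with an error, since they share a mutually exclusive group.
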
69 changes: 54 additions & 15 deletions thepipe/scraper.py
@@ -130,23 +130,56 @@ def scrape_plaintext(file_path: str) -> List[Chunk]:
text = file.read()
return [Chunk(path=file_path, texts=[text])]

def scrape_directory(dir_path: str, include_regex: Optional[str] = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = False) -> List[Chunk]:
def scrape_directory(dir_path: str, include_regex: Optional[str] = None, include_pattern: Optional[str] = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = False) -> List[Chunk]:
extraction = []
all_files = glob.glob(f'{dir_path}/**/*', recursive=True)
if include_regex:
all_files = [file for file in all_files if re.search(include_regex, file, re.IGNORECASE)]

if include_pattern is not None:
# Use glob pattern
pattern = os.path.join(dir_path, '**', include_pattern)
all_files = glob.glob(pattern, recursive=True)
elif include_regex is not None:
# Use regex
all_files = []
for root, _, files in os.walk(dir_path):
for file in files:
file_path = os.path.join(root, file)
if re.search(include_regex, file_path, re.IGNORECASE):
all_files.append(file_path)
else:
# Neither pattern nor regex specified, include all files
all_files = []
for root, _, files in os.walk(dir_path):
for file in files:
all_files.append(os.path.join(root, file))

# Ensure we're only dealing with files
all_files = [f for f in all_files if os.path.isfile(f)]

if verbose:
print(f"[thepipe] Found {len(all_files)} files to process in {dir_path}")

with ThreadPoolExecutor() as executor:
results = executor.map(lambda file_path: scrape_file(filepath=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local), all_files)
results = executor.map(
lambda file_path: scrape_file(
filepath=file_path,
ai_extraction=ai_extraction,
text_only=text_only,
verbose=verbose,
local=local
),
all_files
)
for result in results:
extraction += result
extraction.extend(result)

return extraction
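
An illustrative programmatic call (not part of the diff) through the glob branch of the updated signature; the directory name and pattern are made up:

# Illustrative only: glob-based selection via the new include_pattern argument.
from thepipe.scraper import scrape_directory

chunks = scrape_directory(
    dir_path="./docs",
    include_pattern="*.md",  # expands to ./docs/**/*.md, recursive
    text_only=True,
    verbose=True,
)
print(f"[example] scraped {len(chunks)} chunks")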

def scrape_zip(file_path: str, include_regex: Optional[str] = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = False) -> List[Chunk]:
chunks = []
with tempfile.TemporaryDirectory() as temp_dir:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(temp_dir)
chunks = scrape_directory(dir_path=temp_dir, include_regex=include_regex, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local)
chunks = scrape_directory(dir_path=temp_dir, include_regex=include_regex, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local)
return chunks

def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False) -> List[Chunk]:
@@ -648,6 +681,7 @@ def scrape_docx(file_path: str, verbose: bool = False, text_only: bool = False)
from docx.text.paragraph import Paragraph
import csv
import io
import weakref

# helper function to iterate through blocks in the document
def iter_block_items(parent):
@@ -675,23 +709,22 @@ def read_docx_tables(tab):

# read the document
document = Document(file_path)
chunks = []
image_counter = 0

# Define namespaces
nsmap = {
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
}
chunks = []
image_counter = 0

try:
# scrape each block in the document to create chunks
# A block can be a paragraph, table, or image
for block in iter_block_items(document):
block_texts = []
block_images = []
if block.__class__.__name__ == 'Paragraph':
if isinstance(block, Paragraph):
block_texts.append(block.text)
if not text_only:
# "runs" are the smallest units in a paragraph
@@ -710,9 +743,9 @@ def read_docx_tables(tab):
image_data = io.BytesIO(image_part._blob)
image = Image.open(image_data)
image.load()
block_images.append(image)
block_images.append(image) # Append the image directly, not a weak reference
image_counter += 1
elif block.__class__.__name__ == 'Table':
elif isinstance(block, Table):
table_text = read_docx_tables(block)
block_texts.append(table_text)
if block_texts or block_images:
Expand All @@ -721,8 +754,14 @@ def read_docx_tables(tab):
finally:
# Close any open image files
for chunk in chunks:
for image in chunk.images:
image.close()
for img_ref in chunk.images:
img = img_ref() if isinstance(img_ref, weakref.ReferenceType) else img_ref
if img is not None:
try:
img.close()
except Exception as e:
if verbose:
print(f"[thepipe] Error closing image: {str(e)}")

return chunks
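
Since Chunk now keeps only weak references to images, downstream code should read them through get_valid_images(); an illustrative sketch follows (not part of the diff, file name made up). It may yield fewer images than were extracted if the underlying objects have already been collected:

# Illustrative only: consuming scrape_docx output with weak image references.
from thepipe.scraper import scrape_docx

chunks = scrape_docx("report.docx", verbose=True)
for chunk in chunks:
    for image in chunk.get_valid_images():  # skips references that have gone dead
        print(chunk.path, image.size)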
