From 5ae341d454b66c0ab25e279f840912b6f2f14bb2 Mon Sep 17 00:00:00 2001
From: Michael Clifford
Date: Sat, 27 Apr 2024 09:23:14 -0400
Subject: [PATCH] get exact token counts

Signed-off-by: Michael Clifford
---
 .../summarizer/app/requirements.txt           |  2 +-
 .../summarizer/app/summarizer.py              | 25 +++++++++++--------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/recipes/natural_language_processing/summarizer/app/requirements.txt b/recipes/natural_language_processing/summarizer/app/requirements.txt
index 7f30524f..e1778e7b 100644
--- a/recipes/natural_language_processing/summarizer/app/requirements.txt
+++ b/recipes/natural_language_processing/summarizer/app/requirements.txt
@@ -1,4 +1,4 @@
 langchain
 langchain_openai
 streamlit
-pypdf
+pymupdf

diff --git a/recipes/natural_language_processing/summarizer/app/summarizer.py b/recipes/natural_language_processing/summarizer/app/summarizer.py
index bb39bf21..64059561 100644
--- a/recipes/natural_language_processing/summarizer/app/summarizer.py
+++ b/recipes/natural_language_processing/summarizer/app/summarizer.py
@@ -2,10 +2,11 @@
 from langchain_openai import ChatOpenAI
 from langchain.prompts import PromptTemplate
 from langchain_community.callbacks import StreamlitCallbackHandler
-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyMuPDFLoader
 import streamlit as st
 import tempfile
 import requests
+import json
 import time
 import os
 
@@ -35,15 +36,19 @@ def checking_model_service():
 
 def chunk_text(text):
     chunks = []
-    num_words = len(text.split())
-    text_list = text.split()
     chunk_size = 1024
-    num_chunks = (num_words//chunk_size)+1
-
-    for _ in range(num_chunks):
-        chunk = text_list[:chunk_size]
-        chunks.append(" ".join(chunk))
-        text_list = text_list[chunk_size:]
+    tokens = requests.post(f"{model_service[:-2]}extras/tokenize/",
+                           json={"input":text}).content
+    tokens = json.loads(tokens)["tokens"]
+    num_tokens = len(tokens)
+    num_chunks = (num_tokens//chunk_size)+1
+    for i in range(num_chunks):
+        chunk = tokens[:chunk_size]
+        chunk = requests.post(f"{model_service[:-2]}extras/detokenize/",
+                              json={"tokens":chunk}).content
+        chunk = json.loads(chunk)["text"]
+        chunks.append(chunk)
+        tokens = tokens[chunk_size:]
     return chunks
 
 def read_file(file):
@@ -53,7 +58,7 @@ def read_file(file):
     temp = tempfile.NamedTemporaryFile()
     with open(temp.name, "wb") as f:
         f.write(file.getvalue())
-    loader = PyPDFLoader(temp.name)
+    loader = PyMuPDFLoader(temp.name)
     pages = loader.load()
     text = "".join([p.page_content for p in pages])
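
Note: for reference, the new chunking logic in standalone form. This is a
minimal sketch, assuming a llamacpp-python model service whose
/extras/tokenize/ and /extras/detokenize/ endpoints accept and return JSON
exactly as used in the patch; MODEL_SERVICE is a hypothetical stand-in for
the app's model_service URL (the patch's model_service[:-2] appears to strip
the trailing "v1" from an OpenAI-style base URL to reach the service root).

    import requests

    # Hypothetical service root; the app derives this from its
    # OpenAI-compatible endpoint via model_service[:-2].
    MODEL_SERVICE = "http://localhost:8001/"

    def chunk_text(text, chunk_size=1024):
        """Split `text` into chunks of at most `chunk_size` model tokens."""
        # Tokenize once with the model's own tokenizer so the count is
        # exact, instead of approximating tokens with whitespace-split words.
        resp = requests.post(f"{MODEL_SERVICE}extras/tokenize/",
                             json={"input": text})
        tokens = resp.json()["tokens"]

        chunks = []
        for start in range(0, len(tokens), chunk_size):
            window = tokens[start:start + chunk_size]
            # Detokenize each window back to text so downstream prompt
            # templates receive plain strings.
            resp = requests.post(f"{MODEL_SERVICE}extras/detokenize/",
                                 json={"tokens": window})
            chunks.append(resp.json()["text"])
        return chunks

The word-based splitter this replaces could overshoot the model's context
window, since one word is often more than one token; counting real tokens
makes each chunk's size guarantee exact.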