Skip to content

Commit

Permalink
chore: update something
Browse files Browse the repository at this point in the history
  • Loading branch information
lh0x00 committed Jan 14, 2025
1 parent 5211460 commit 727ffc6
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 372 deletions.
139 changes: 73 additions & 66 deletions docsifer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
import gradio as gr
import requests
# filename: __init__.py

import json
import logging
import pandas as pd
import tempfile
from typing import Tuple
from pathlib import Path

import gradio as gr
import pandas as pd
import requests
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from gradio.routes import mount_gradio_app
from pathlib import Path

# If you are using scuid for unique filename generation, import it:
# If you want to generate unique filenames, e.g. scuid:
from scuid import scuid
import tempfile


# Access-log filter: keep only records whose request path contains "/v1"
class LogFilter(logging.Filter):
def filter(self, record):
# Only keep log records that contain "/v1" in the request path
if record.args and len(record.args) >= 3:
if "/v1" in str(record.args[2]):
return True
Expand Down Expand Up @@ -45,6 +48,7 @@ def filter(self, record):

# Set your Docsifer API URL here (change host/port if needed)
DOCSIFER_API_URL = "http://localhost:7860/v1/convert"
DOCSIFER_STATS_URL = "http://localhost:7860/v1/stats"

# Markdown description for the main interface
APP_DESCRIPTION = f"""
Expand Down Expand Up @@ -96,38 +100,38 @@ def call_convert_api(
cleanup: bool = True,
) -> Tuple[str, str]:
"""
Calls the /v1/convert endpoint, returning the Markdown or an error message.
Optionally returns a path for the Markdown file for Gradio download.
:param file_obj: In-memory uploaded file content.
:param filename: Unique or user-supplied filename (only used for identification).
:param openai_base_url: Custom base URL for OpenAI if advanced extraction is desired.
:param openai_api_key: API key for OpenAI if advanced extraction is desired.
:param cleanup: Toggle HTML cleanup for *.htm/*.html files.
Returns:
(markdown_content, temp_md_path)
Calls the /v1/convert endpoint, returning (markdown_content, md_file_path).
If there's an error, the first return value is an error message (str),
the second is an empty string.
The updated /v1/convert expects:
- file (UploadFile)
- openai_str: JSON string => {"api_key":"...", "base_url":"...", "model_id":"..."}
- settings_str: JSON string => {"cleanup": bool}
"""
data = {"cleanup": cleanup}
files = {}
headers = {}

# If we have in-memory content, do a multipart/form-data upload
if file_obj is not None:
files = {"file": (filename, file_obj)}
else:

if file_obj is None:
return ("❌ No file was uploaded.", "")

# Include optional OpenAI parameters
if openai_base_url.strip():
data["openai_base_url"] = openai_base_url
# Build the JSON strings for the new API
openai_dict = {}
if openai_api_key.strip():
data["openai_api_key"] = openai_api_key
openai_dict["api_key"] = openai_api_key
if openai_base_url.strip():
openai_dict["base_url"] = openai_base_url

settings_dict = {"cleanup": cleanup}

data = {
"openai_str": json.dumps(openai_dict),
"settings_str": json.dumps(settings_dict),
}

# Prepare files for multipart/form-data
files = {"file": (filename, file_obj)}

try:
response = requests.post(
DOCSIFER_API_URL, files=files, data=data, headers=headers, timeout=30
)
response = requests.post(DOCSIFER_API_URL, files=files, data=data, timeout=30)
except requests.exceptions.RequestException as e:
return (f"❌ Network Error: {str(e)}", "")

Expand All @@ -136,43 +140,50 @@ def call_convert_api(

try:
converted = response.json()
# Expecting { "filename": "...", "markdown": "..." }
markdown_content = converted["markdown"]
# Save the result to a temporary .md file so that Gradio can serve it for download
with tempfile.NamedTemporaryFile(
mode="w+", suffix=".md", dir="/tmp", delete=False
) as tmp_file:
tmp_file.write(markdown_content)
tmp_file_path = tmp_file.name

return (markdown_content, tmp_file_path)
except Exception as e:
return (f"❌ Error parsing JSON: {str(e)}", "")

# Write the returned Markdown to a temporary .md file so Gradio can serve it
with tempfile.NamedTemporaryFile(
mode="w+", suffix=".md", dir="/tmp", delete=False
) as tmp_file:
tmp_file.write(markdown_content)
tmp_md_path = tmp_file.name

return (markdown_content, tmp_md_path)


def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Calls the /v1/stats endpoint to retrieve analytics data.
Returns two DataFrames (access_df, tokens_df).
Calls /v1/stats endpoint to retrieve analytics data.
Returns two DataFrames: (access_df, tokens_df).
"""
url = "http://localhost:7860/v1/stats"

try:
response = requests.get(url, timeout=10)
response = requests.get(DOCSIFER_STATS_URL, timeout=10)
except requests.exceptions.RequestException as e:
raise ValueError(f"Failed to fetch stats: {str(e)}")

if response.status_code != 200:
raise ValueError(f"Failed to fetch stats: {response.text}")

data = response.json()
access_data = data["access"]
tokens_data = data["tokens"]
# Expected structure:
# {
# "access": { <period>: {model: count, ...}, ... },
# "tokens": { <period>: {model: count, ...}, ... }
# }
access_data = data.get("access", {})
tokens_data = data.get("tokens", {})

def build_stats_df(bucket: dict) -> pd.DataFrame:
# Collect all model keys from total/daily/weekly/monthly/yearly
# We want columns for periods: total, daily, weekly, monthly, yearly
# Each row = one model
all_models = set()
for period in ["total", "daily", "weekly", "monthly", "yearly"]:
all_models.update(bucket.get(period, {}).keys())
for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
period_dict = bucket.get(period_key, {})
all_models.update(period_dict.keys())

result_dict = {
"Model": [],
Expand Down Expand Up @@ -202,14 +213,14 @@ def create_main_interface():
"""
Creates a Gradio Blocks interface:
- A 'Conversion Playground' tab for uploading a file and converting to Markdown
- A 'Usage Statistics' tab to display usage stats
- An 'API Examples' tab for quick cURL reference
- An 'Analytics Stats' section to display usage statistics
- cURL examples for reference
"""
with gr.Blocks(title="Docsifer: Convert to Markdown", theme="default") as demo:
gr.Markdown(APP_DESCRIPTION)

with gr.Tab("Conversion Playground"):
gr.Markdown("### Convert your files to Markdown in a snap.")
gr.Markdown("### Convert your files to Markdown with Docsifer.")

with gr.Row():
with gr.Column():
Expand All @@ -223,6 +234,7 @@ def create_main_interface():
".html",
".htm",
".jpg",
".jpeg",
".png",
".mp3",
".wav",
Expand All @@ -231,11 +243,10 @@ def create_main_interface():
type="binary",
)

# Accordion for optional OpenAI config
with gr.Accordion("OpenAI Configuration (Optional)", open=True):
gr.Markdown(
"Provide these if you'd like **LLM-assisted** extraction. "
"If left blank, basic conversion will be used."
"If left blank, basic conversion (no LLM) will be used."
)
openai_base_url = gr.Textbox(
label="OpenAI Base URL",
Expand All @@ -248,10 +259,9 @@ def create_main_interface():
type="password",
)

# Accordion for conversion settings
with gr.Accordion("Conversion Settings", open=True):
gr.Markdown(
"Enable to remove undesired elements from `.html` files before conversion."
"Enable to remove <style> tags or hidden elements from `.html` files before conversion."
)
cleanup_toggle = gr.Checkbox(
label="Enable Cleanup",
Expand All @@ -267,7 +277,7 @@ def create_main_interface():
interactive=False,
)
download_file = gr.File(
label="Download Markdown File",
label="Download Markdown File (.md)",
interactive=False,
visible=False,
)
Expand All @@ -279,10 +289,9 @@ def create_main_interface():
**Convert via File Upload (multipart/form-data)**:
```bash
curl -X POST \\
-F "file=@/path/local/document.pdf" \\
-F "openai_api_key=sk-xxxxx" \\
-F "openai_base_url=https://api.openai.com/v1" \\
-F "cleanup=true" \\
-F "file=@/path/to/local/document.pdf" \\
-F "openai_str={\\"api_key\\":\\"sk-xxxxx\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
-F "settings_str={\\"cleanup\\":true}" \\
http://localhost:7860/v1/convert
```
Expand All @@ -299,15 +308,12 @@ def create_main_interface():
def on_convert(file_bytes, base_url, api_key, cleanup):
"""
Callback for the 'Convert' button.
We create a unique name using scuid if the user uploads a file.
We generate a unique name if the user uploads a file.
"""
if not file_bytes:
return "❌ Please upload a file first.", None

# Create a unique placeholder name (fixed ".tmp" suffix; the extension is not detected here)
unique_name = f"{scuid()}.tmp"

# Call the convert API with the file bytes
markdown, temp_md_path = call_convert_api(
file_obj=file_bytes,
filename=unique_name,
Expand Down Expand Up @@ -338,6 +344,7 @@ def on_convert(file_bytes, base_url, api_key, cleanup):
headers=["Model", "Total", "Daily", "Weekly", "Monthly", "Yearly"],
interactive=False,
)

stats_btn.click(
fn=call_stats_api_df,
inputs=[],
Expand Down
Loading

0 comments on commit 727ffc6

Please sign in to comment.