chore: update something

lh0x00 · Jan 14, 2025 · 727ffc6 · 727ffc6
1 parent 5211460
commit 727ffc6
Show file tree

Hide file tree

Showing 4 changed files with 269 additions and 372 deletions.
diff --git a/docsifer/__init__.py b/docsifer/__init__.py
@@ -1,23 +1,26 @@
-import gradio as gr
-import requests
+# filename: __init__.py
+
 import json
 import logging
-import pandas as pd
+import tempfile
 from typing import Tuple
-from pathlib import Path
 
+import gradio as gr
+import pandas as pd
+import requests
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from gradio.routes import mount_gradio_app
+from pathlib import Path
 
-# If you are using scuid for unique filename generation, import it:
+# scuid is used to generate unique filenames for uploaded files:
 from scuid import scuid
-import tempfile
 
 
 # Only let requests whose path contains "/v1" through to the access log
 class LogFilter(logging.Filter):
     def filter(self, record):
+        # Only keep log records that contain "/v1" in the request path
         if record.args and len(record.args) >= 3:
             if "/v1" in str(record.args[2]):
                 return True
@@ -45,6 +48,7 @@ def filter(self, record):
 
 # Set your Docsifer API URL here (change host/port if needed)
 DOCSIFER_API_URL = "http://localhost:7860/v1/convert"
+DOCSIFER_STATS_URL = "http://localhost:7860/v1/stats"
 
 # Markdown description for the main interface
 APP_DESCRIPTION = f"""
@@ -96,38 +100,38 @@ def call_convert_api(
     cleanup: bool = True,
 ) -> Tuple[str, str]:
     """
-    Calls the /v1/convert endpoint, returning the Markdown or an error message.
-    Optionally returns a path for the Markdown file for Gradio download.
-
-    :param file_obj: In-memory uploaded file content.
-    :param filename: Unique or user-supplied filename (only used for identification).
-    :param openai_base_url: Custom base URL for OpenAI if advanced extraction is desired.
-    :param openai_api_key: API key for OpenAI if advanced extraction is desired.
-    :param cleanup: Toggle HTML cleanup for *.htm/*.html files.
-
-    Returns:
-        (markdown_content, temp_md_path)
+    Calls the /v1/convert endpoint, returning (markdown_content, md_file_path).
+    If there's an error, the first return value is an error message (str),
+    the second is an empty string.
+
+    The updated /v1/convert expects:
+      - file (UploadFile)
+      - openai_str: JSON string => {"api_key":"...", "base_url":"...", "model_id":"..."}
+      - settings_str: JSON string => {"cleanup": bool}
     """
-    data = {"cleanup": cleanup}
-    files = {}
-    headers = {}
-
-    # If we have in-memory content, do a multipart/form-data upload
-    if file_obj is not None:
-        files = {"file": (filename, file_obj)}
-    else:
+
+    if file_obj is None:
         return ("❌ No file was uploaded.", "")
 
-    # Include optional OpenAI parameters
-    if openai_base_url.strip():
-        data["openai_base_url"] = openai_base_url
+    # Build the JSON strings for the new API
+    openai_dict = {}
     if openai_api_key.strip():
-        data["openai_api_key"] = openai_api_key
+        openai_dict["api_key"] = openai_api_key
+    if openai_base_url.strip():
+        openai_dict["base_url"] = openai_base_url
+
+    settings_dict = {"cleanup": cleanup}
+
+    data = {
+        "openai_str": json.dumps(openai_dict),
+        "settings_str": json.dumps(settings_dict),
+    }
+
+    # Prepare files for multipart/form-data
+    files = {"file": (filename, file_obj)}
 
     try:
-        response = requests.post(
-            DOCSIFER_API_URL, files=files, data=data, headers=headers, timeout=30
-        )
+        response = requests.post(DOCSIFER_API_URL, files=files, data=data, timeout=30)
     except requests.exceptions.RequestException as e:
         return (f"❌ Network Error: {str(e)}", "")
 
@@ -136,43 +140,50 @@ def call_convert_api(
 
     try:
         converted = response.json()
+        # Expecting { "filename": "...", "markdown": "..." }
         markdown_content = converted["markdown"]
-        # Save the result to a temporary .md file so that Gradio can serve it for download
-        with tempfile.NamedTemporaryFile(
-            mode="w+", suffix=".md", dir="/tmp", delete=False
-        ) as tmp_file:
-            tmp_file.write(markdown_content)
-            tmp_file_path = tmp_file.name
-
-        return (markdown_content, tmp_file_path)
     except Exception as e:
         return (f"❌ Error parsing JSON: {str(e)}", "")
 
+    # Write the returned Markdown to a temporary .md file so Gradio can serve it
+    with tempfile.NamedTemporaryFile(
+        mode="w+", suffix=".md", dir="/tmp", delete=False
+    ) as tmp_file:
+        tmp_file.write(markdown_content)
+        tmp_md_path = tmp_file.name
+
+    return (markdown_content, tmp_md_path)
+
 
 def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Calls the /v1/stats endpoint to retrieve analytics data.
-    Returns two DataFrames (access_df, tokens_df).
+    Calls /v1/stats endpoint to retrieve analytics data.
+    Returns two DataFrames: (access_df, tokens_df).
     """
-    url = "http://localhost:7860/v1/stats"
-
     try:
-        response = requests.get(url, timeout=10)
+        response = requests.get(DOCSIFER_STATS_URL, timeout=10)
     except requests.exceptions.RequestException as e:
         raise ValueError(f"Failed to fetch stats: {str(e)}")
 
     if response.status_code != 200:
         raise ValueError(f"Failed to fetch stats: {response.text}")
 
     data = response.json()
-    access_data = data["access"]
-    tokens_data = data["tokens"]
+    # Expected structure:
+    # {
+    #   "access": { <period>: {model: count, ...}, ... },
+    #   "tokens": { <period>: {model: count, ...}, ... }
+    # }
+    access_data = data.get("access", {})
+    tokens_data = data.get("tokens", {})
 
     def build_stats_df(bucket: dict) -> pd.DataFrame:
-        # Collect all model keys from total/daily/weekly/monthly/yearly
+        # We want columns for periods: total, daily, weekly, monthly, yearly
+        # Each row = one model
         all_models = set()
-        for period in ["total", "daily", "weekly", "monthly", "yearly"]:
-            all_models.update(bucket.get(period, {}).keys())
+        for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
+            period_dict = bucket.get(period_key, {})
+            all_models.update(period_dict.keys())
 
         result_dict = {
             "Model": [],
@@ -202,14 +213,14 @@ def create_main_interface():
     """
     Creates a Gradio Blocks interface:
     - A 'Conversion Playground' tab for uploading a file and converting to Markdown
-    - A 'Usage Statistics' tab to display usage stats
-    - An 'API Examples' tab for quick cURL reference
+    - An 'Analytics Stats' section to display usage statistics
+    - cURL examples for reference
     """
     with gr.Blocks(title="Docsifer: Convert to Markdown", theme="default") as demo:
         gr.Markdown(APP_DESCRIPTION)
 
         with gr.Tab("Conversion Playground"):
-            gr.Markdown("### Convert your files to Markdown in a snap.")
+            gr.Markdown("### Convert your files to Markdown with Docsifer.")
 
             with gr.Row():
                 with gr.Column():
@@ -223,6 +234,7 @@ def create_main_interface():
                             ".html",
                             ".htm",
                             ".jpg",
+                            ".jpeg",
                             ".png",
                             ".mp3",
                             ".wav",
@@ -231,11 +243,10 @@ def create_main_interface():
                         type="binary",
                     )
 
-                    # Accordion for optional OpenAI config
                     with gr.Accordion("OpenAI Configuration (Optional)", open=True):
                         gr.Markdown(
                             "Provide these if you'd like **LLM-assisted** extraction. "
-                            "If left blank, basic conversion will be used."
+                            "If left blank, basic conversion (no LLM) will be used."
                         )
                         openai_base_url = gr.Textbox(
                             label="OpenAI Base URL",
@@ -248,10 +259,9 @@ def create_main_interface():
                             type="password",
                         )
 
-                    # Accordion for conversion settings
                     with gr.Accordion("Conversion Settings", open=True):
                         gr.Markdown(
-                            "Enable to remove undesired elements from `.html` files before conversion."
+                            "Enable to remove <style> tags or hidden elements from `.html` files before conversion."
                         )
                         cleanup_toggle = gr.Checkbox(
                             label="Enable Cleanup",
@@ -267,7 +277,7 @@ def create_main_interface():
                         interactive=False,
                     )
                     download_file = gr.File(
-                        label="Download Markdown File",
+                        label="Download Markdown File (.md)",
                         interactive=False,
                         visible=False,
                     )
@@ -279,10 +289,9 @@ def create_main_interface():
                         **Convert via File Upload (multipart/form-data)**:
                         ```bash
                         curl -X POST \\
-                            -F "file=@/path/local/document.pdf" \\
-                            -F "openai_api_key=sk-xxxxx" \\
-                            -F "openai_base_url=https://api.openai.com/v1" \\
-                            -F "cleanup=true" \\
+                            -F "file=@/path/to/local/document.pdf" \\
+                            -F "openai_str={\\"api_key\\":\\"sk-xxxxx\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
+                            -F "settings_str={\\"cleanup\\":true}" \\
                             http://localhost:7860/v1/convert
                         ```
 
@@ -299,15 +308,12 @@ def create_main_interface():
             def on_convert(file_bytes, base_url, api_key, cleanup):
                 """
                 Callback for the 'Convert' button.
-                We create a unique name using scuid if the user uploads a file.
+                We generate a unique name if the user uploads a file.
                 """
                 if not file_bytes:
                     return "❌ Please upload a file first.", None
 
-                # Create a unique name with the detected extension
                 unique_name = f"{scuid()}.tmp"
-
-                # Call the convert API with the file bytes
                 markdown, temp_md_path = call_convert_api(
                     file_obj=file_bytes,
                     filename=unique_name,
@@ -338,6 +344,7 @@ def on_convert(file_bytes, base_url, api_key, cleanup):
                 headers=["Model", "Total", "Daily", "Weekly", "Monthly", "Yearly"],
                 interactive=False,
             )
+
             stats_btn.click(
                 fn=call_stats_api_df,
                 inputs=[],