-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
84 lines (67 loc) · 2.91 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from flask import Flask, request, render_template, redirect
import gradio as gr
import fitz
import os
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
#app = Flask(__name__)
# Load the summarization checkpoint once at import time; the tokenizer and
# model come from the same checkpoint and are wrapped in a single pipeline.
_CHECKPOINT = "Falconsai/text_summarization"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
)
def preprocess_text(text, max_length):
    """Cut ``text`` into consecutive pieces of at most ``max_length`` characters.

    The pieces are plain character slices taken in order, so the last one may
    be shorter than ``max_length`` and an empty ``text`` yields an empty list.
    """
    chunks = []
    for start in range(0, len(text), max_length):
        stop = start + max_length
        chunks.append(text[start:stop])
    return chunks
def pdf_to_text(pdf_path, subheader_min_size=12):
    """Extract the text of a PDF plus the spans that look like subheaders.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
        subheader_min_size: Spans whose font size is strictly greater than
            this value are collected as subheaders. This is a heuristic;
            the default of 12 matches the original hard-coded threshold.

    Returns:
        A ``(text, subheaders)`` tuple. ``text`` is the text of every span in
        reading order, each followed by a single space. ``subheaders`` is the
        list of span texts whose font size exceeds ``subheader_min_size``.
    """
    pieces = []
    subheaders = []
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry have no spans to read and
                # are skipped (presumably non-text blocks such as images).
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        span_text = span["text"]
                        if span["size"] > subheader_min_size:
                            subheaders.append(span_text)
                        pieces.append(span_text)
    # Join once instead of growing a string with += inside the loops.
    return "".join(f"{piece} " for piece in pieces), subheaders
def summarize(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    The maximum summary length is one third of the input's word count, but
    never less than 50; the minimum length is fixed at 30 and sampling is
    disabled.

    Args:
        text: The text chunk to summarize.

    Returns:
        The ``summary_text`` of the first pipeline result, or an empty string
        when ``text`` contains no words (empty or whitespace-only input), in
        which case the model is not called at all.
    """
    words = text.split()
    if not words:
        # Nothing to summarize; avoid asking the model to produce at least
        # 30 tokens out of an empty input.
        return ""
    max_length = max(50, int(len(words) / 3))
    results = summarizer(
        text,
        max_length=max_length,
        min_length=30,
        do_sample=False,
    )
    return results[0]['summary_text']
def process_pdf(file):
    """Summarize an uploaded PDF and list its detected subheaders.

    Args:
        file: The value Gradio passes for the upload. It may be ``None``
            (nothing uploaded), a filesystem path (``str`` or path-like), or
            an object exposing the path as ``.name``.

    Returns:
        A ``(summary, subheaders)`` tuple of strings: the space-joined
        summaries of each 500-character chunk of the PDF text, and the
        subheaders joined with newlines. Both are empty when ``file`` is
        ``None``.
    """
    if file is None:
        # Submitting without an upload should not crash the handler.
        return "", ""
    # A path is used directly; anything else is expected to carry its path
    # in ``.name``, as the original code assumed.
    if isinstance(file, (str, os.PathLike)):
        pdf_path = os.fspath(file)
    else:
        pdf_path = file.name
    text, subheaders = pdf_to_text(pdf_path)
    parts = preprocess_text(text, 500)
    summaries = [summarize(part) for part in parts]
    full_summary = " ".join(summaries)
    return full_summary, "\n".join(subheaders)
# Gradio front end: one PDF upload feeds process_pdf, whose two return values
# fill the "Summary" and "Subheaders" text boxes.
_pdf_upload = gr.File(label='Upload PDF')
_result_boxes = [
    gr.Textbox(label='Summary'),
    gr.Textbox(label='Subheaders'),
]
interface = gr.Interface(
    fn=process_pdf,
    inputs=_pdf_upload,
    outputs=_result_boxes,
    title="PDF Summarizer",
    description="Upload a PDF to get a summarized text",
)
# Start the app immediately; share=True is passed through to launch().
interface.launch(share=True)
# @app.route('/')
# def index():
# return render_template('index.html')
# @app.route('/upload', methods=['POST'])
# def upload():
# if 'file' not in request.files:
# return redirect(request.url)
# file = request.files['file']
# if file.filename == '':
# return redirect(request.url)
# if file and file.filename.endswith('.pdf'):
# upload_folder = './uploads'
# if not os.path.exists(upload_folder):
# os.makedirs(upload_folder)
# file_path = os.path.join(upload_folder, file.filename)
# file.save(file_path)
# text, _subheaders = pdf_to_text(file_path)  # pdf_to_text returns (text, subheaders)
# parts = preprocess_text(text, 500) # Assuming a safe size under model's max length
# summaries = [summarize(part) for part in parts]
# full_summary = " ".join(summaries) # Join all partial summaries
# return render_template('result.html', summary=full_summary)
# return redirect(request.url)
# if __name__ == '__main__':
# app.run(debug=True)