# app.py
from collections import namedtuple
import numpy as np
import pytesseract
import cv2
import streamlit as st

# point pytesseract at the local Tesseract binary (adjust this path to
# match your own install)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
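
# optional sanity check: pytesseract raises TesseractNotFoundError if the
# binary above is missing, so you can fail fast at startup if you prefer:
# print(pytesseract.get_tesseract_version())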
def cleanup_text(text):
# strip out non-ASCII text so we can draw the text on the image
# using OpenCV
return "".join([c if ord(c) < 128 else "" for c in text]).strip()
# the original CLI version of this script parsed --image/--template paths
# with argparse; the Streamlit version below takes the image from a file
# uploader instead, so the parser is no longer needed
# create a named tuple which we can use to store the locations of the
# input document that we wish to OCR
OCRLocation = namedtuple("OCRLocation", ["id", "bbox", "filter_keywords"])
# define the locations of each area of the document we wish to OCR,
# as (x, y, w, h) bounding boxes in pixels of the template layout
OCR_LOCATIONS = [
    OCRLocation("name", (410, 70, 712, 77), []),
    OCRLocation("phone", (63, 440, 159, 22), []),
    OCRLocation("email", (65, 514, 313, 23), []),
    OCRLocation("address", (63, 590, 324, 66), []),
    OCRLocation("expertise", (87, 1034, 297, 302), []),
    OCRLocation("languages", (56, 1400, 331, 194), []),
]
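
# each bbox is cropped with NumPy slicing as aligned[y:y + h, x:x + w]; for
# example, the "name" field above becomes aligned[70:70 + 77, 410:410 + 712]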
with open("style.css") as f:
st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
st.set_option('deprecation.showfileUploaderEncoding', False)
st.title("Extracting Information From Resumes")
resume = st.file_uploader("Upload image", type=["jpg", "png", "jpeg"], label_visibility="hidden")
from PIL import Image, ImageOps
if resume is None:
    st.text("Please upload an image file")
else:
    # decode the upload's in-memory bytes into an OpenCV image (the upload
    # does not exist on disk, so cv2.imread() on resume.name would fail)
    file_bytes = np.asarray(bytearray(resume.read()), dtype=np.uint8)
    resume_image = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)
    st.image(cv2.cvtColor(resume_image, cv2.COLOR_BGR2RGB), use_column_width=True)
    aligned = resume_image
    # initialize a results list to store the document OCR parsing results
    parsingResults = []
    # loop over the locations of the document we are going to OCR
    for loc in OCR_LOCATIONS:
        # extract the OCR ROI from the aligned image
        (x, y, w, h) = loc.bbox
        roi = aligned[y:y + h, x:x + w]
        # OCR the ROI using Tesseract
        rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
        text = pytesseract.image_to_string(rgb)
        # break the text into lines and loop over them
        for line in text.split("\n"):
            # if the line is empty, ignore it
            if len(line) == 0:
                continue
            # convert the line to lowercase and then check to see if the
            # line contains any of the filter keywords (these keywords
            # are part of the *form itself* and should be ignored)
            lower = line.lower()
            count = sum([lower.count(x) for x in loc.filter_keywords])
            # if the count is zero, the line is field content rather than
            # boilerplate printed on the form itself, so keep it
            if count == 0:
                parsingResults.append((loc, line))
    # initialize a dictionary to store our final OCR results
    results = {}
    # loop over the results of parsing the document
    for (loc, line) in parsingResults:
        # grab any existing OCR result for the current ID of the document
        r = results.get(loc.id, None)
        # if there is no result yet, initialize it using the text and the
        # location (converted to a dictionary so the same shape is stored
        # whether or not lines get merged later)
        if r is None:
            results[loc.id] = (line, loc._asdict())
        # otherwise, an OCR result already exists for this area of the
        # document, so append the new line to the existing text
        else:
            # unpack the existing OCR result and append the line to the
            # existing text
            (existingText, locDict) = r
            text = "{}\n{}".format(existingText, line)
            # update our results dictionary
            results[locDict["id"]] = (text, locDict)
    # loop over the results and render one section per OCR location
    for (locID, result) in results.items():
        # unpack the result tuple
        (text, loc) = result
        st.subheader(loc["id"].title())
        # render the multi-line sections one line at a time
        if loc["id"] in ["expertise", "languages"]:
            for line in text.split('\n'):
                st.write("{}".format(line))
        else:
            st.write("{}\n\n".format(text))
        st.divider()
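
# to launch the app locally:
#   streamlit run app.py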