-
Notifications
You must be signed in to change notification settings - Fork 35
/
tesseract_features.py
69 lines (59 loc) · 3 KB
/
tesseract_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
'''
AAA lllllll lllllll iiii
A:::A l:::::l l:::::l i::::i
A:::::A l:::::l l:::::l iiii
A:::::::A l:::::l l:::::l
A:::::::::A l::::l l::::l iiiiiii eeeeeeeeeeee
A:::::A:::::A l::::l l::::l i:::::i ee::::::::::::ee
A:::::A A:::::A l::::l l::::l i::::i e::::::eeeee:::::ee
A:::::A A:::::A l::::l l::::l i::::i e::::::e e:::::e
A:::::A A:::::A l::::l l::::l i::::i e:::::::eeeee::::::e
A:::::AAAAAAAAA:::::A l::::l l::::l i::::i e:::::::::::::::::e
A:::::::::::::::::::::A l::::l l::::l i::::i e::::::eeeeeeeeeee
A:::::AAAAAAAAAAAAA:::::A l::::l l::::l i::::i e:::::::e
A:::::A A:::::A l::::::ll::::::li::::::ie::::::::e
A:::::A A:::::A l::::::ll::::::li::::::i e::::::::eeeeeeee
A:::::A A:::::A l::::::ll::::::li::::::i ee:::::::::::::e
AAAAAAA AAAAAAAlllllllllllllllliiiiiiii eeeeeeeeeeeeee
______ _ ___ ______ _____
| ___| | | / _ \ | ___ \_ _| _
| |_ ___ __ _| |_ _ _ _ __ ___ ___ / /_\ \| |_/ / | | (_)
| _/ _ \/ _` | __| | | | '__/ _ \/ __| | _ || __/ | |
| || __/ (_| | |_| |_| | | | __/\__ \ | | | || | _| |_ _
\_| \___|\__,_|\__|\__,_|_| \___||___/ \_| |_/\_| \___/ (_)
_____
|_ _|
| | _ __ ___ __ _ __ _ ___
| || '_ ` _ \ / _` |/ _` |/ _ \
_| || | | | | | (_| | (_| | __/
\___/_| |_| |_|\__,_|\__, |\___|
__/ |
|___/
Extracts image features if default_image_features = ['tesseract_features']
Transcribes image files with OCR (pytesseract module) and then featurizes
these transcripts with nltk_features. Read more about nltk_features @
https://github.com/jim-schwoebel/allie/blob/master/features/text_features/nltk_features.py
'''
import os, sys
from PIL import Image
import pytesseract
def prev_dir(directory):
    '''
    Return the parent portion of a '/'-separated path, trailing slash included.

    directory: path string using '/' as the separator
               (e.g. '/home/user/features/image_features').

    Returns everything up to and including the last '/'
    (e.g. '/home/user/features/'). A string without any '/' yields ''.
    A path that already ends in '/' is returned unchanged.
    '''
    # Slice at the LAST separator. Locating the final component with str.find()
    # matches its first occurrence, which truncates the path too early whenever
    # that text also appears earlier (e.g. '/data/a' would become '/d').
    return directory[:directory.rfind('/') + 1]
# Make the sibling 'text_features' folder importable so nltk_features can be loaded.
# NOTE(review): this resolves paths from the current working directory, not from
# this file's location - presumably the caller chdir's into this module's folder
# before importing; confirm.
directory=os.getcwd()
# Rebinds the name prev_dir from the helper function to the parent-path string it
# returns; after this line the function is no longer reachable under that name.
prev_dir=prev_dir(directory)
sys.path.append(prev_dir+'/text_features')
import nltk_features as nf
# Return to the working directory captured above (only matters if the import
# above changed it - TODO confirm whether nltk_features does so).
os.chdir(directory)
def transcribe_image(imgfile):
    '''
    Run OCR on an image file and return the recognized text.

    imgfile: image to transcribe; passed straight to PIL's Image.open.

    Returns whatever pytesseract.image_to_string produces for the opened image.
    Errors raised by Image.open or pytesseract propagate to the caller.
    '''
    # Open through a context manager so the underlying file handle is released
    # as soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(imgfile) as image:
        return pytesseract.image_to_string(image)
def tesseract_featurize(imgfile):
    '''
    Transcribe an image with OCR and featurize the transcript.

    imgfile: image to process; handed to transcribe_image.

    Returns a 3-tuple (transcript, features, labels), where features and labels
    are the two values unpacked from nf.nltk_featurize(transcript).
    '''
    # can stitch across an entire length of video frames too
    ocr_text = transcribe_image(imgfile)
    nltk_output = nf.nltk_featurize(ocr_text)
    feature_values, feature_labels = nltk_output
    return ocr_text, feature_values, feature_labels