-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
84 lines (64 loc) · 3.06 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import glob
import sys, os
import time
from Text_Extractor import PDF_Reader,Docx_Reader,txt_Reader
from DB_File import create_db,insert_values_into_files_path,insert_values_into_word_path,\
insert_values_into_file_time_stamps
# we are taking the bigrams and trigrams
def convert_2_Bi_trigrams(word_list):
bigrams,trigrams = [],[]
for i in range(len(word_list)-2):
val = word_list[i] + ' ' + word_list[i+1]
bigrams.append(val)
for i in range(len(word_list)-3):
val = word_list[i] + ' ' + word_list[i+1] + ' ' +word_list[i+2]
trigrams.append(val)
return bigrams+trigrams
if __name__ == '__main__':
Path_list = []
rootdir = input("Enter folder to be searched: ")
rootdir += "/"
# we are obtaining the files in the directory and subdirectories
for path in glob.glob(f'{rootdir}**/*',recursive = True):
split_tup = os.path.splitext(path)
print(path)
if split_tup[1] == '.docx' or split_tup[1]== '.pdf' or split_tup[1] == '.txt':
Path_list.append(path)
create_db()# creates tables
# Processing the PDF files
for path in Path_list:
split_tup = os.path.splitext(path)
file_name = split_tup[0].split("/")[-1]
if split_tup[1]== '.pdf':
time_stamp = os.path.getctime(path)
insert_values_into_files_path((file_name.lower(),path))# Pushing the file name and path into the DB
insert_values_into_files_path(('.pdf',path))# Pushing type of the file and path into DB
insert_values_into_file_time_stamps((file_name.lower(),path,time_stamp))# Pusing the file and time stamps(created time)
word_list = PDF_Reader(path)
word_list += convert_2_Bi_trigrams(word_list)
for word in word_list:
insert_values_into_word_path((word,path)) #Inserting the words(bigrams+trigrams) into the DB
for path in Path_list:
split_tup = os.path.splitext(path)
file_name = split_tup[0].split("/")[-1]
if split_tup[1]== '.docx':
time_stamp = os.path.getctime(path)
insert_values_into_files_path((file_name.lower(),path))# Pushing the file name and path into the DB
insert_values_into_files_path(('.docx',path))# Pushing type of the file and path into DB
insert_values_into_file_time_stamps((file_name.lower(),path,time_stamp))# Pusing the file and time stamps(created time)
word_list = Docx_Reader(path)
word_list += convert_2_Bi_trigrams(word_list)
for word in word_list:
insert_values_into_word_path((word,path))#Inserting the words(bigrams+trigrams) into the DB
for path in Path_list:
split_tup = os.path.splitext(path)
file_name = split_tup[0].split("/")[-1]
if split_tup[1]== '.txt':
time_stamp = os.path.getctime(path)
insert_values_into_files_path((file_name.lower(),path))# Pushing the file name and path into the DB
insert_values_into_files_path(('.txt',path))# Pushing type of the file and path into DB
insert_values_into_file_time_stamps((file_name.lower(),path,time_stamp))# Pusing the file and time stamps(created time)
word_list = txt_Reader(path)
word_list += convert_2_Bi_trigrams(word_list)
for word in word_list:
insert_values_into_word_path((word,path))#Inserting the words(bigrams+trigrams) into the DB