-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_pdf.py
65 lines (48 loc) · 1.95 KB
/
read_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Tokenizer setup. Both enable_* calls run at import time, before any
# segmentation happens; deal_text() later calls pseg.cut(..., use_paddle=True).
# NOTE(review): the 4 passed to enable_parallel is presumably the number of
# worker processes — confirm against the jieba documentation.
import jieba
jieba.enable_paddle()
jieba.enable_parallel(4)
import jieba.posseg as pseg
# Path of the PDF that read_pdf() opens.
# NOTE(review): absolute, machine-specific path — presumably needs editing
# before the script is run anywhere else.
pdf_file = '/Users/vanxv/Downloads/500Libary/chaoyuezhishang.pdf'
import pdftotext
from collections import Counter
# NOTE(review): WordCloud is not referenced by any code visible in this file.
from wordcloud import WordCloud
import re
# Punctuation-stripping regex patterns. Only r4 is referenced by the code
# visible in this file (in deal_text); r1-r3 are defined but not used here.
#
# Author's note on r1: it does not filter "\\", "\", Chinese parentheses or
# the "————" dash. Users may also customise the filtered characters here.
r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
# Author's note on r2: this rule does not filter everything either.
# NOTE(review): "\s", "\.", "\!" and "\/" are unrecognised escapes in a
# non-raw literal; newer Python versions emit a warning for them.
r2 = "[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+"
# Author's note on r3: the backslashes filter single and double backslashes,
# "/" filters single and double forward slashes; the first bracket holds
# English symbols and the second holds Chinese symbols. The "|" before the
# second bracket must not be omitted, otherwise filtering is incomplete.
# NOTE(review): inside the first bracket "#-|" is parsed as a character range
# from "#" to "|", which also covers ASCII digits and letters — confirm intent.
r3 = "[.!//_,$&%^*()<>+\"'?@#-|:~{}]+|[——!\\\\,。=?、:“”‘’《》【】¥……()]+"
# Author's note on r4: remove brackets and everything inside them
# (【...】, 《...》 and #...#), plus the listed punctuation characters.
# NOTE(review): the adjacent "" ends one string literal and starts another, so
# no double-quote character ends up in the first bracket; "\," is also an
# unrecognised escape in a non-raw literal.
r4 = "\\【.*?】+|\\《.*?》+|\\#.*?#+|[.!/_,$&%^*()<>+""'?@|:~{}#]+|[——!\\\,。=?、:“”‘’¥……()《》【】]"
def analysis(text):
    """Count how often each item of *text* occurs and print the tally.

    The whole counter is printed first, followed by one ``(item, count)``
    tuple per line, most frequent first.

    Args:
        text: Iterable of hashable items; ``deal_text`` passes a list of words.

    Returns:
        The ``Counter`` that was printed, so callers can reuse the tally.
    """
    # Counter(iterable) does the tallying directly. The original also called
    # ``cnt.most_common()`` once and discarded the result; that no-op is gone.
    cnt = Counter(text)
    print(cnt)
    for entry in cnt.most_common():
        print(entry)
    return cnt
def deal_text(text):
    """Clean *text*, keep the words with wanted tags, and tally them.

    Newlines, spaces, "," and "。" are removed, then everything matched by the
    module-level pattern ``r4`` is deleted. The result is segmented with
    ``pseg.cut(..., use_paddle=True)``; each word whose tag ends with one of
    the wanted suffixes is collected and the list is passed to ``analysis``.
    Nothing is returned.
    """
    # Tag suffixes to keep; tag reference: https://github.com/fxsjy/jieba
    # NOTE(review): 'ORG' appears twice — possibly 'LOC' was meant; confirm.
    wanted_suffixes = ("n", "t", 'PER', 'ORG', 'ORG', 'TIME')

    # Same replacements, in the same order, as the original statement chain.
    substitutions = (("\n", " "), (" ", ""), (" ", ""), (",", ""), ("。", ""))
    for old, new in substitutions:
        text = text.replace(old, new)
    cleaned = re.sub(r4, '', text)

    kept_words = [
        token
        for token, tag in pseg.cut(cleaned, use_paddle=True)
        if tag.endswith(wanted_suffixes)
    ]
    analysis(kept_words)
def read_pdf(path=None):
    """Extract the text of a PDF and pass it to ``deal_text``.

    Args:
        path: PDF file to read. Defaults to the module-level ``pdf_file``, so
            existing ``read_pdf()`` calls behave as before.
    """
    if path is None:
        path = pdf_file
    with open(path, "rb") as f:
        # Iterating the PDF object yields the page strings; join builds the
        # full text in one pass instead of repeated ``+=`` copies.
        text = "".join(pdftotext.PDF(f))
    deal_text(text)
# Script entry point: runs the whole pipeline on the default pdf_file.
# NOTE(review): there is no __main__ guard, so this also runs on import.
read_pdf()