-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy path4. ngram.py
30 lines (23 loc) · 1006 Bytes
/
4. ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import nltk, re
from nltk.tokenize import word_tokenize
# importing ngrams module from nltk
from nltk.util import ngrams
from collections import Counter
from looking_glass import looking_glass_full_text
# Normalise the text: collapse every run of non-word characters into a single
# space, then lowercase. The pattern is a raw string so that `\W` reaches the
# regex engine untouched instead of being parsed as an (invalid) string escape,
# which raises a SyntaxWarning on Python 3.12+.
cleaned = re.sub(r'\W+', ' ', looking_glass_full_text).lower()
tokenized = word_tokenize(cleaned)

# Bigrams (n = 2): tally how often each pair of adjacent tokens occurs.
looking_glass_bigrams = ngrams(tokenized, 2)
looking_glass_bigrams_frequency = Counter(looking_glass_bigrams)

# Trigrams (n = 3): same tally for runs of three adjacent tokens.
looking_glass_trigrams = ngrams(tokenized, 3)
looking_glass_trigrams_frequency = Counter(looking_glass_trigrams)

# Longer n-grams (n = 10 here; any value greater than 3 works the same way).
looking_glass_ngrams = ngrams(tokenized, 10)
looking_glass_ngrams_frequency = Counter(looking_glass_ngrams)

# Report the ten most frequent entries of each size.
print("Looking Glass Bigrams:")
print(looking_glass_bigrams_frequency.most_common(10))
print("\nLooking Glass Trigrams:")
print(looking_glass_trigrams_frequency.most_common(10))
print("\nLooking Glass n-grams:")
print(looking_glass_ngrams_frequency.most_common(10))