-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstats.py
66 lines (53 loc) · 2.33 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
import numpy as np
from transformers import LongformerTokenizer
# A (user, game) document is only included in the per-document statistics
# when the user wrote at least this many sentences in that game.
_MIN_SENTENCES_PER_DOCUMENT = 10


def _sentence_token_counts(post, tokenizer):
    """Return the token count of every non-empty sentence in ``post``.

    A "sentence" is a chunk of ``post`` delimited by two consecutive
    newlines, with surrounding whitespace stripped. Chunks that are empty
    after stripping are ignored.

    Args:
        post: Text of a single post.
        tokenizer: Object exposing ``encode(text)`` that returns a sized
            sequence of tokens.

    Returns:
        A list with one integer per successfully tokenized sentence, in the
        order the sentences appear in ``post``.
    """
    counts = []
    for chunk in post.split('\n\n'):
        sentence = chunk.strip()
        if not sentence:
            continue
        try:
            tokens = tokenizer.encode(sentence)
        except Exception:
            # Report the offending sentence and leave it out of the
            # statistics. Falling through here would either raise a
            # NameError or silently reuse the previous sentence's tokens.
            print(sentence)
            continue
        # NOTE(review): encode() is called with its default arguments; if the
        # tokenizer adds special tokens by default they are part of this
        # count -- confirm that is intended.
        counts.append(len(tokens))
    return counts


# NOTE: unpickling can execute arbitrary code, so this path must only ever
# point at a trusted file.
df = pd.read_pickle('pkls/mafia_raw.pkl', compression="gzip")
grouped_df = df.groupby(["author", "game_id"])

num_sentences_in_games = []  # sentences per qualifying (user, game) document
sentence_lens = []           # tokens per sentence, over every sentence seen
num_tokens_in_games = []     # tokens per qualifying (user, game) document

# NOTE(review): presumably a local directory or a shortcut model name that
# from_pretrained can resolve -- confirm it still resolves.
tokenizer = LongformerTokenizer.from_pretrained('longformer-base-4096')

# Iterating the GroupBy already yields each group's rows, so there is no need
# to look the group up again by key.
for _, group in grouped_df:
    # Token counts of all sentences a user wrote in one game, across posts.
    document_counts = []
    for post in group.content.values:
        document_counts.extend(_sentence_token_counts(post, tokenizer))

    # Sentence-level statistics cover every sentence, even those belonging to
    # documents that are too short for the per-document statistics.
    sentence_lens.extend(document_counts)

    if len(document_counts) >= _MIN_SENTENCES_PER_DOCUMENT:
        num_sentences_in_games.append(len(document_counts))
        num_tokens_in_games.append(sum(document_counts))
# Rebind the collected lists as Series so describe() can summarise them.
num_sentences_in_games, num_tokens_in_games, sentence_lens = map(
    pd.Series,
    (num_sentences_in_games, num_tokens_in_games, sentence_lens),
)

# Each entry pairs the heading to print with the Series it describes.
summaries = (
    ("For each document (user, game), the number of sentences:",
     num_sentences_in_games),
    ("For each document (user, game), the number of tokens:",
     num_tokens_in_games),
    ("All in all, the number of tokens in a sentence:",
     sentence_lens),
)

for position, (heading, series) in enumerate(summaries):
    # A blank line separates consecutive summaries (none before the first).
    if position > 0:
        print("")
    print(heading)
    print(series.describe())