-
Notifications
You must be signed in to change notification settings - Fork 1
/
SplitCorpus.py
executable file
·80 lines (70 loc) · 4.96 KB
/
SplitCorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import librosa, os
import numpy as np
import pandas as pd
from glob import glob
import tqdm
from ArtifactsVector import ArtifactsVector
from Corpus import Corpus
from plot_log_population import plot_log_population
from plot_log_population2 import plot_log_population2
from sample_statistics import sample_statistics as stats
from text_of_file import text_of_file
from sos import sos
class SplitCorpus(Corpus):
    """Corpus whose artifacts are the pieces obtained by splitting recordings.

    Instances are normally built through one of the alternate constructors
    (`transcript_split`, `split_on_silence`, `pool_split_on_silence`), each of
    which collects the split artifacts of every recording and hands them to
    the regular constructor.  Population-level measurements are delegated to
    an `ArtifactsVector` stored in `self.population`.
    """

    # Plots drawn by both `visualization` and `diff_visualization`.
    # Each entry is the name of an attribute on `self.population` followed by
    # the four trailing arguments forwarded unchanged to the plot helper
    # (presumably title, x label, y label and a bin/limit count -- confirm
    # against plot_log_population).
    _SHARED_POPULATION_PLOTS = (
        ('word_lengths_in_graphemes',
         'Word lengths', 'Graphemes/word', 'Words with this many graphemes', 12),
        ('samples_per_grapheme',
         'Audio samples per grapheme', 'Samples/grapheme', 'Graphemes that are this long in samples', 100),
        ('samples_per_word',
         'Audio samples per word', 'Samples/word', 'Words that are this long in samples', 100),
        ('split_length_in_words',
         'Splits with this many words', 'word length', 'splits', 100),
        ('split_length_in_graphemes',
         "splits with this many graphemes", 'grapheme length', 'splits', 100),
        ('split_length_in_seconds',
         "Splits with this many seconds length", 'sample length (seconds)', 'splits', 100),
    )

    def __init__(self, _config, _artifacts):
        """Initialise the base corpus and build the population vector.

        Args:
            _config: configuration object, passed through to `Corpus` and
                `ArtifactsVector`.
            _artifacts: list of split artifacts making up this corpus.
        """
        super().__init__(_config, _artifacts)
        self.population = ArtifactsVector(_config, _artifacts)

    @classmethod
    def transcript_split(cls, _config, _recordings):
        """Build a corpus from each recording's `transcript_split()` pieces.

        Args:
            _config: configuration object forwarded to the constructor.
            _recordings: object exposing an `artifacts` iterable whose items
                implement `transcript_split()`.

        Returns:
            A new instance of `cls` holding all pieces, in recording order.
        """
        pieces = [
            piece
            for artifact in _recordings.artifacts
            for piece in artifact.transcript_split()
        ]
        return cls(_config, pieces)

    @classmethod
    def pool_split_on_silence(cls, _config, _pool, _recordings, _goal_length_in_seconds):
        """Build a corpus by running `sos` over every recording via a pool.

        Args:
            _config: configuration object forwarded to the constructor.
            _pool: object providing `imap(func, iterable)`; presumably a
                multiprocessing pool -- confirm against the caller.
            _recordings: object exposing an `artifacts` iterable.
            _goal_length_in_seconds: value paired with each artifact in the
                task tuple handed to `sos`.

        Returns:
            A new instance of `cls` holding the flattened results.
        """
        tasks = [(artifact, _goal_length_in_seconds) for artifact in _recordings.artifacts]
        # `imap` yields results in task order; tqdm only wraps it for a
        # progress bar, and `list` drains it before flattening.
        per_recording = list(tqdm.tqdm(_pool.imap(sos, tasks), total=len(tasks)))
        pieces = [piece for split in per_recording for piece in split]
        return cls(_config, pieces)

    @classmethod
    def split_on_silence(cls, _config, _recordings, _goal_length_in_seconds):
        """Build a corpus by calling `split_on_silence` on each recording.

        Sequential counterpart of `pool_split_on_silence`; prints one progress
        line per recording.

        Args:
            _config: configuration object forwarded to the constructor.
            _recordings: object exposing an `artifacts` iterable whose items
                have a `key` and implement `split_on_silence(...)`.
            _goal_length_in_seconds: forwarded as `goal_length_in_seconds`.

        Returns:
            A new instance of `cls` holding all pieces, in recording order.
        """
        pieces = []
        for i, artifact in enumerate(_recordings.artifacts):
            print(f"[{i}] PROCESSING {artifact.key}")
            pieces.extend(
                artifact.split_on_silence(goal_length_in_seconds=_goal_length_in_seconds)
            )
        return cls(_config, pieces)

    def visualization(self):
        """Plot the population distributions of this corpus.

        Draws the splits-per-recording plot first, then every plot listed in
        `_SHARED_POPULATION_PLOTS`, in order.
        """
        plot_log_population(self.population.N_splits_per_root, 'Splits per 10-minute recording', '# splits per recording', '# recordings with this many splits', 100)
        for attribute, *plot_args in self._SHARED_POPULATION_PLOTS:
            plot_log_population(getattr(self.population, attribute), *plot_args)

    def sample_statistics(self):
        """Return summary statistics of this corpus as a DataFrame.

        The rows combine three corpus-level counts, the word-length statistics
        produced by the `stats` helper, and whatever
        `self.population.sample_statistics()` returns.

        Returns:
            A DataFrame with columns `self.columns`, sorted by 'Corpus',
            'Units' and 'Measurement', with a fresh 0..n-1 index.
        """
        rows = [
            ('Words', '#Words', 'Distinct words in all recordings', self.population.N_all_words),
            ('Graphemes', '#Graphemes', 'Distinct graphemes in all transcriptions', self.population.N_all_graphemes),
            ('Splits', '#Splits', 'Splits in all recordings', self.n_artifacts),
        ]
        rows.extend(stats('Words', 'Length in graphemes', self.population.word_lengths_in_graphemes))
        rows.extend(self.population.sample_statistics())
        table = pd.DataFrame(rows, columns=self.columns)
        return table.sort_values(by=['Corpus', 'Units', 'Measurement']).reset_index(drop=True)

    def diff_visualization(self, new):
        """Plot this corpus's distributions against those of `new`.

        Draws every plot listed in `_SHARED_POPULATION_PLOTS`, in order,
        passing this corpus's population first and `new`'s second.

        Args:
            new: another corpus exposing a compatible `population`.
        """
        for attribute, *plot_args in self._SHARED_POPULATION_PLOTS:
            plot_log_population2(
                getattr(self.population, attribute),
                getattr(new.population, attribute),
                *plot_args,
            )

    def diff_sample_statistics(self, new):
        """Return the statistics whose value differs between self and `new`.

        Both tables are inner-joined on every column except the last, so a
        measurement present in only one corpus does not appear in the result.

        Args:
            new: another corpus implementing `sample_statistics()`.

        Returns:
            A DataFrame of the joined rows where `Value_x` (this corpus) is
            not equal to `Value_y` (`new`).
        """
        df_old = self.sample_statistics()
        df_new = new.sample_statistics()
        # NOTE(review): relies on the last column being named 'Value', so the
        # merge suffixes it into Value_x / Value_y -- confirm in Corpus.columns.
        key_columns = list(df_old.columns[:-1])
        merged = pd.merge(df_old, df_new, how='inner', on=key_columns)
        return merged[merged.Value_x != merged.Value_y]

    def check_vocabulary_change(self, new):
        """Return, sorted, the words of this corpus that are absent from `new`.

        Args:
            new: another corpus whose `population.all_words` is iterable.

        Returns:
            A sorted list of the words in `self.population.all_words` that do
            not occur in `new.population.all_words`.
        """
        # `sorted` already returns a list and `difference` accepts any
        # iterable, so no extra list()/set() wrappers are needed.
        return sorted(set(self.population.all_words).difference(new.population.all_words))