-
Notifications
You must be signed in to change notification settings - Fork 0
/
veda.py
691 lines (537 loc) · 28.6 KB
/
veda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
import os
import argparse
import glob
import re
import csv
import json
import metronome as met
import pandas as pd
from bs4 import BeautifulSoup
from utils.transliteration import transliterate
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import seaborn as sns
import ast
class VedaMetronome():
    '''Build, score and visualise metrical ("metronome") patterns of Rigvedic verses.

    The class reads its inputs from ``<base_dir>/src`` and writes intermediate
    data to ``<base_dir>/data`` and figures to ``<base_dir>/fig``, where
    ``base_dir`` is the directory containing this file.
    '''

    # SLP1 phoneme classes used by _convert_to_skeleton. They are class-level
    # so the skeleton conversion works without first calling
    # _transform_text_to_metronome (which used to be the only place they were set).
    short_vowel_list = ["a", "i", "u", "f", "x"]
    long_vowel_list = ["A", "I", "U", "F", "e", "o", "E", "O"]
    consonant_list = ["k", "K", "g", "G", "N", "c", "j", "J", "Y", "w", "W", "q", "R", "t", "T", "d", "D", "n", "p", "P", "b", "B", "m", "y", "r", "l", "v", "S", "z", "s", "h", "H", "M"]
    double_consonant_list = ["C", "Q"]

    def __init__(self):
        '''Resolve the base, source and JSON-cache paths relative to this file.'''
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.src_dir = os.path.join(self.base_dir, 'src')
        self.json_path = os.path.join(self.src_dir, "text.json")

    def _ensure_dir(self, dir_path: str, dir_name: str):
        '''Make sure a directory exists, asking the user before creating it.

        Args:
            dir_path (str): Full path of the directory.
            dir_name (str): Short name shown in the prompt and messages.
        Returns:
            bool: True if the directory exists when the function returns,
                False if the user declined to create it.
        '''
        if os.path.exists(dir_path):
            return True
        user_input = input(f"Create '{dir_name}' directory? (Y/n):")
        if user_input.lower() == 'y' or user_input == '':
            os.makedirs(dir_path, exist_ok=True)
            print(f"Directory '{dir_name}' has been created.")
            return True
        print(f"Creation of directory '{dir_name}' has been canceled.")
        return False

    def _create_text_dict(self):
        '''Create a dictionary mapping verse identifiers to verse text.

        Every ``*.tei`` file in ``src_dir`` is parsed as XML. For each ``<lg>``
        element whose ``source`` attribute is ``vnh``, the ``xml:id`` (with the
        ``_vnh`` suffix removed) becomes the key, and the stripped texts of its
        ``<l>`` children, joined with " |" and terminated with "|", become the value.
        The dictionary is also written to ``json_path`` as a cache.

        Returns:
            dict: Verse identifier -> concatenated verse text.
        Note:
            The identifiers are expected to follow the pattern bXX_hYYY_ZZ
            (book, hymn, verse), matching what _create_id produces.
        '''
        text_dict = {}
        for text_path in glob.glob(os.path.join(self.src_dir, '*.tei')):
            with open(text_path, mode='r', encoding='utf-8') as f:
                soup = BeautifulSoup(f.read(), 'xml')
            for lg_tag in soup.find_all('lg', {'source': 'vnh'}):
                lg_id = lg_tag.get('xml:id').replace("_vnh", "")
                l_texts = [l_tag.text.strip() for l_tag in lg_tag.find_all('l')]
                # every line, including the last one, ends with a "|" marker
                text_dict[lg_id] = " |".join(l_texts) + "|"
        with open(self.json_path, mode='w', encoding='utf-8') as f:
            json.dump(text_dict, f)
        return text_dict

    def _convert_to_skeleton(self, text: str):
        '''Convert a text into a phonemic skeleton, one symbol per character.

        Symbols:
            - "V" for characters in short_vowel_list
            - "W" for characters in long_vowel_list
            - "C" for characters in consonant_list
            - "H" for characters in double_consonant_list
            - "." for spaces
            - "|" for vertical lines
        The characters ";" and ":" are skipped.

        Args:
            text (str): The input text (SLP1 is what the phoneme lists contain).
        Returns:
            str: The skeleton. If an unknown character is met, a message is
                printed and the skeleton built so far is returned (truncated).
        '''
        symbols = []
        for phoneme in text:
            if phoneme in self.short_vowel_list:
                symbols.append("V")
            elif phoneme in self.long_vowel_list:
                symbols.append("W")
            elif phoneme in self.consonant_list:
                symbols.append("C")
            elif phoneme in self.double_consonant_list:
                symbols.append("H")
            elif phoneme == " ":
                symbols.append(".")
            elif phoneme == "|":
                symbols.append("|")
            elif phoneme in (";", ":"):
                continue
            else:
                # documented behaviour: report and stop, keeping the partial result
                print(f"{phoneme} is not in phoneme list.")
                break
        return "".join(symbols)

    def _clean_text(self, text: str):
        '''Remove markup characters from the text.

        Removed characters: "3", "@", "\\", "-", "+", "'", "&", "~", "*", "/",
        "`" and U+0300 (combining grave accent).

        Args:
            text (str): The input text to be cleaned.
        Returns:
            str: The text with the characters above removed.
        '''
        return re.sub(r"[3@\\\-\+\'&~\*/\`\u0300]", "", text)

    def _skeleton_to_metronome(self, skeleton: str):
        '''Turn a phonemic skeleton into an "S"/"w" rhythm string.

        For every vowel symbol the consonants up to the next vowel (or the end)
        are counted ("C" = 1, "H" = 2; "." and "|" do not count and do not stop
        the count). The mora value is the vowel weight ("V" = 0, "W" = 1) plus
        ``max(0, consonants - 1)``. A vowel with mora > 0 is written as "S",
        otherwise as "w". "." and "|" are copied through; consonants are dropped.

        Args:
            skeleton (str): Output of _convert_to_skeleton.
        Returns:
            str: The rhythm string.
        '''
        vowel_weight = {"V": 0, "W": 1}
        consonant_weight = {"C": 1, "H": 2}
        # sentinel consonant, "to calculate the final syllable" as in the original code
        skeleton += "C"
        vowel_idx_list = [idx for idx, symbol in enumerate(skeleton) if symbol in vowel_weight]
        # key: index of vowel, value: mora
        vowel_mora_dict = {}
        for i, vowel_idx in enumerate(vowel_idx_list):
            fin = vowel_idx_list[i + 1] if i + 1 < len(vowel_idx_list) else len(skeleton)
            consonant_counter = sum(
                consonant_weight.get(symbol, 0) for symbol in skeleton[vowel_idx + 1:fin]
            )
            vowel_mora_dict[vowel_idx] = vowel_weight[skeleton[vowel_idx]] + max(0, consonant_counter - 1)
        parts = []
        for idx, symbol in enumerate(skeleton):
            if symbol in (".", "|"):
                parts.append(symbol)
            elif idx in vowel_mora_dict:
                parts.append("S" if vowel_mora_dict[idx] > 0 else "w")
        return "".join(parts)

    def _transform_text_to_metronome(self, text: str):
        '''Transform an IAST text into its "S"/"w" rhythm representation.

        The text is transliterated from IAST to SLP1, cleaned with _clean_text,
        converted with _convert_to_skeleton and finally scored with
        _skeleton_to_metronome.

        Args:
            text (str): The input text in IAST.
        Returns:
            str: The rhythm string; "" when ``text`` is not a string (for
                example None/NaN for a verse without text), instead of raising.
        '''
        if not isinstance(text, str):
            return ""
        text_slp1 = transliterate(text, "iast", "slp1")
        text_slp1 = self._clean_text(text_slp1)
        # get a skeleton (V, W, C, H, ., |)
        skeleton = self._convert_to_skeleton(text_slp1)
        return self._skeleton_to_metronome(skeleton)

    def _create_id(self, book_number: int, hymn_number: int, verse_number: int):
        '''Create the verse identifier bXX_hYYY_ZZ from book, hymn and verse numbers.

        Args:
            book_number (int): The number of the book (zero-padded to 2 digits).
            hymn_number (int): The number of the hymn (zero-padded to 3 digits).
            verse_number (int): The number of the verse (zero-padded to 2 digits).
        Returns:
            str: The identifier, e.g. "b02_h015_04" for (2, 15, 4).
        Note:
            The values are passed through int() so integral floats are accepted too.
        '''
        return f"b{int(book_number):02d}_h{int(hymn_number):03d}_{int(verse_number):02d}"

    def _create_csv(self):
        '''Create "rigveda.csv" with author, work, metronome and meter per verse.

        Steps:
            - Load the verse texts from ``json_path`` if it exists, otherwise build
              them with _create_text_dict.
            - Read ``src_dir/rv_info.csv``; the columns "bookNum", "hymnNum",
              "verseNum", "poet" and "meter" are used.
            - Add "work" (verse id from _create_id), "text" (looked up in the text
              dictionary) and "metronome" (from _transform_text_to_metronome).
            - Rename "poet" to "author".
            - Save only the columns "author", "work", "metronome", "meter" to
              ``base_dir/rigveda.csv``.
        '''
        # load text from json or make text
        if os.path.exists(self.json_path):
            with open(self.json_path, mode='r', encoding='utf-8') as f:
                text_dict = json.load(f)
        else:
            text_dict = self._create_text_dict()
        # read poet, meter, deity, verse number CSV
        df = pd.read_csv(os.path.join(self.src_dir, "rv_info.csv"))
        # add "work" column, which is identical to the verse ID
        df["work"] = df.apply(lambda row: self._create_id(row["bookNum"], row["hymnNum"], row["verseNum"]), axis=1)
        # add "text" column (missing ids give a missing value)
        df["text"] = df["work"].map(text_dict.get)
        # add "metronome" column converted from the text column
        df["metronome"] = df["text"].apply(self._transform_text_to_metronome)
        # rename column "poet" to "author"
        df.rename(columns={"poet": "author"}, inplace=True)
        new_header = ["author", "work", "metronome", "meter"]
        # save dataframe as CSV
        csv_path = os.path.join(self.base_dir, "rigveda.csv")
        df[new_header].to_csv(csv_path, index=False)

    def preprocess(self):
        '''Run the preprocessing step, i.e. generate "rigveda.csv" via _create_csv.

        Progress messages are printed before and after.
        '''
        print("Start preprocessing.")
        self._create_csv()
        print("Complete.")

    def _save_scores(self, scores: pd.DataFrame, prefix: str):
        '''Save a score table as ``data/{prefix}_{branch_number}.csv`` under base_dir.

        Nothing is written if the "data" directory is missing and the user
        declines to create it.

        Args:
            scores (DataFrame): The table to save.
            prefix (str): File name prefix ("basic" or "fast").
        '''
        data_dir = os.path.join(self.base_dir, "data")
        if not self._ensure_dir(data_dir, "data"):
            return
        scores_path = os.path.join(data_dir, f"{prefix}_{self.branch_number}.csv")
        scores.to_csv(scores_path, index=False)
        print(f"Saved {os.path.basename(scores_path)}")

    def _basic_scoring(self, df: pd.DataFrame):
        '''Compute the distance matrix with ``Scorer.dist_matrix`` and save it.

        Args:
            df (DataFrame): Verse table with the rhythm strings in the
                "metronome" column.
        Note:
            - The result is saved as "basic_{branch_number}.csv" in the "data"
              directory; ``self.branch_number`` must be set (see scoring).
        '''
        print("Start Basic Scoring.")
        scorer = met.scoring.Scorer()
        # metronome is also the default column name
        df1 = scorer.dist_matrix(df, col='metronome')
        self._save_scores(df1, "basic")

    def _fast_scoring(self, df: pd.DataFrame):
        '''Compute the distance matrix with ``Scorer.dist_matrix_parallel`` and save it.

        Args:
            df (DataFrame): Verse table with the rhythm strings in the
                "metronome" column.
        Note:
            - The result is saved as "fast_{branch_number}.csv" in the "data"
              directory; ``self.branch_number`` must be set (see scoring).
        '''
        print("Start Fast Scoring.")
        scorer = met.scoring.Scorer()
        # metronome is also the default column name
        df2 = scorer.dist_matrix_parallel(df, col='metronome')
        self._save_scores(df2, "fast")

    def scoring(self, df: pd.DataFrame, books: list):
        '''Run basic and fast scoring for the given verses.

        Args:
            df (DataFrame): Verse table with a "metronome" column.
            books (list): Book numbers the table covers; joined with "-" they
                form ``self.branch_number``, which is used in the output names.
        Note:
            - Outputs: "basic_{branch_number}.csv" and "fast_{branch_number}.csv"
              in the "data" directory within the base directory.
        '''
        self.df = df
        self.books = books
        self.branch_number = '-'.join([str(book) for book in self.books])
        # Basic Scoring
        self._basic_scoring(self.df)
        # Fast Scoring (runs ray locally, may be more fragile)
        self._fast_scoring(self.df)

    def _save_label_color_map(self, dendro):
        '''Save the colour of every dendrogram leaf as a TSV file.

        Each row holds a colour from the seaborn "tab10" palette and the text
        "{label[1]}_{label[0]}" of the leaf label. The file has no header row
        and is written to ``data/text_{books}.tsv`` under base_dir.

        Args:
            dendro (dict): Result of scipy's dendrogram; the keys
                'leaves_color_list' and 'ivl' are read.
        Note:
            - The last character of each leaf colour name is used as the palette
              index (presumably matplotlib cycle names such as "C1" - confirm).
        '''
        # str() so that this also works when self.books holds ints (as set by scoring)
        books_part = '-'.join(str(book) for book in self.books)
        tsv_path = os.path.join(self.base_dir, "data", f"text_{books_part}.tsv")
        # newline='' is what the csv module requires for files given to csv.writer
        with open(tsv_path, 'w', newline='') as tsvfile:
            lv_color_list = dendro['leaves_color_list']
            writer = csv.writer(tsvfile, delimiter="\t")
            color_set = sns.color_palette("tab10", n_colors=len(set(lv_color_list))+1)
            for color, label in zip(lv_color_list, dendro['ivl']):
                writer.writerow([color_set[int(color[-1])], f"{label[1]}_{label[0]}"])

    def create_text_plot(self, tsv_path: str):
        '''Draw the labels of a label-colour TSV as coloured text and save a PNG.

        Args:
            tsv_path (str): Path to a header-less TSV with two columns: a colour
                (a literal that ast.literal_eval can parse) and a label.
        Note:
            - Labels are sorted in ascending order and drawn from bottom to top.
            - The image is saved as ``fig/color_{tsv base name}.png`` under
              base_dir; nothing is saved if the "fig" directory is missing and
              the user declines to create it.
        '''
        # The TSV has no header row; without header=None the first label would
        # be consumed as column names and silently dropped.
        df = pd.read_csv(tsv_path, sep='\t', header=None, names=['color', 'label'])
        # sort the labels in ascending order
        df.sort_values(by='label', inplace=True)
        # set the figure size
        plt.figure(figsize=(5, 20))
        num_rows = df.shape[0]
        # draw one coloured text per row
        for i, (_, row) in enumerate(df.iterrows()):
            color = list(ast.literal_eval(row['color']))
            plt.text(0.5, (i / num_rows), row['label'], fontsize=5, color=color, ha='center', va='center')
        plt.axis('off')  # hide the axes
        fig_dir = os.path.join(self.base_dir, "fig")
        if not self._ensure_dir(fig_dir, "fig"):
            return
        plt.savefig(os.path.join(fig_dir, f"color_{os.path.splitext(os.path.basename(tsv_path))[0]}.png"))

    def dendrogram(self, csv_path: str, id_to_label: dict):
        '''Cluster the verses hierarchically and save a dendrogram as PNG.

        Args:
            csv_path (str): Path, relative to base_dir, of the score CSV. The file
                name must look like "{prefix}_{book}-{book}-....csv"; the book
                numbers are taken from it.
            id_to_label (dict): Mapping whose values are the leaf labels; the
                first element of each label (the meter) selects the tick colour.
        Note:
            - pdist is applied to the rows of the CSV and the result is clustered
              with Ward linkage.
            - The leaf colours are also saved through _save_label_color_map.
            - The image is saved as ``fig/clustering_{csv base name}.png`` under
              base_dir; nothing is saved if the "fig" directory is missing and
              the user declines to create it.
        '''
        self.id_to_label = id_to_label
        self.labels = list(id_to_label.values())
        bn = os.path.splitext(os.path.basename(csv_path))[0]
        # book numbers as strings, e.g. "basic_1-2" -> ['1', '2']
        self.books = bn.split('_')[1].split('-')
        df = pd.read_csv(os.path.join(self.base_dir, csv_path))
        # compute the condensed distance matrix
        distance_matrix = pdist(df)
        # hierarchical clustering on the distance matrix
        self.Z = linkage(distance_matrix, method='ward')
        # draw the dendrogram
        plt.figure(figsize=(160, 90), dpi=200)
        color_threshold = 15
        dendro = dendrogram(self.Z,
                            labels=self.labels,
                            orientation='right',
                            color_threshold=color_threshold
                            )
        self._save_label_color_map(dendro)
        # colour the tick labels by meter
        meter_set = set([label[0] for label in self.labels])
        palette = sns.color_palette("husl", len(meter_set))
        self.label_to_color = {label: color for label, color in zip(meter_set, palette)}
        ax = plt.gca()
        for lbl in ax.get_ymajorticklabels():
            # tick text is the printed label tuple; its first element is the meter
            meter = ast.literal_eval(lbl.get_text())[0]
            lbl.set_color(self.label_to_color[meter])
        plt.title(f"Rigveda {' '.join([str(book) for book in self.books])}", fontsize=90)
        plt.xlabel('Distance', fontsize=60)
        plt.ylabel('Labels', fontsize=60)
        plt.xticks(fontsize=50)
        # self.books holds strings here, so compare against strings
        # (the former comparison with [1], [8], [10] could never be true)
        if self.books in (['1'], ['8'], ['10']):
            plt.yticks(fontsize=4)
        fig_dir = os.path.join(self.base_dir, "fig")
        if not self._ensure_dir(fig_dir, "fig"):
            return
        img_path = os.path.join(fig_dir, f"clustering_{bn}.png")
        plt.savefig(img_path)
def get_args(argv=None):
    '''Parse command-line arguments for the VedaMetronome script.

    Args:
        argv (list, optional): Argument strings to parse. When None (the
            default) argparse reads them from ``sys.argv[1:]``, which is the
            behaviour callers without arguments have always had.
    Returns:
        argparse.Namespace: The parsed arguments with the attributes
            ``preprocess``, ``metronome``, ``dendrogram``, ``text`` (booleans)
            and ``books`` (list of int, or None when ``-b`` is not given).
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--preprocess", action="store_true", help="Perform preprocessing")
    parser.add_argument("-m", "--metronome", action="store_true", help="Scoring")
    parser.add_argument("-d", "--dendrogram", action="store_true", help="Visualization (dendrogram)")
    parser.add_argument("-t", "--text", action="store_true", help="Visualization (text, sorted labels)")
    parser.add_argument("-b", "--books", nargs='+', type=int, help="Specify up to 10 values")
    return parser.parse_args(argv)
def main():
    '''VedaMetronome script entry point.

    Parses the command-line options and runs the requested steps in this
    order: preprocessing, scoring, dendrogram visualization, text visualization.

    Usage:
        python veda.py -p -m -d -t -b 1 2 3 ...
    Options:
        -p, --preprocess Perform preprocessing on Rigvedic text files.
        -m, --metronome Perform scoring of metronomic rhythm and generate scores.
        -d, --dendrogram Generate dendrogram visualization for clustering.
        -t, --text Generate text visualization with sorted labels.
        -b, --books Specify up to 10 book values for analysis.
    Note:
        - Without -b only preprocessing can run; the other steps are skipped
          with a message instead of failing.
        - "rigveda.csv" and the label TSV are read from the directory of this
          script (``base_dir``), which is where they are written.
    '''
    args = get_args()
    rv_metronome = VedaMetronome()
    # create a text
    if args.preprocess:
        rv_metronome.preprocess()
    # every later step needs a book selection
    if not args.books:
        if args.metronome or args.dendrogram or args.text:
            print("No books specified: use -b/--books to select the books to analyse.")
        return
    # create a data
    df = pd.read_csv(os.path.join(rv_metronome.base_dir, "rigveda.csv"))
    books = sorted(args.books[:10])
    # verse ids start with the book prefix, e.g. "b01"
    prefixes = ["b{:02d}".format(book) for book in books]
    df_data = df[df["work"].str[:3].isin(prefixes)]
    labels = list(zip(df_data["meter"], df_data["work"]))
    id_to_label = dict(zip(df_data.index, labels))
    bn = '-'.join([str(book) for book in books])
    # score
    if args.metronome:
        rv_metronome.scoring(df_data, books)
    # clustering
    if args.dendrogram:
        # relative path: dendrogram() resolves it against base_dir itself
        csv_path = os.path.join("data", f"basic_{bn}.csv")
        rv_metronome.dendrogram(csv_path, id_to_label)
    # coloring text
    if args.text:
        tsv_path = os.path.join(rv_metronome.base_dir, "data", f"text_{bn}.tsv")
        rv_metronome.create_text_plot(tsv_path)


if __name__ == '__main__':
    main()