// EnglishPhonemeBigramModel.cc
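// Generates a sparse WFSA for an English phoneme bigram language model from
// the given bigram counts file. Only the WFSA is written by this program; the
// FST that uses the cypher and the counts file is not produced here.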
#include <stdexcept>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <map>
#include <set>
#include "EMViterbiPackage/Notation.h"
#include "TagGrammarFinderSparse.h"
#include "CypherReader.h"
// When true, main also dumps every loaded (notation, value) pair to
// found_bigram_probs.txt before it writes the WFSA.
#define PRINT_PROBS true
using namespace std;
// Path of the WFSA file that main writes.
const string WFSA_FILE = "1_eng_phoneme_lm.wfsa";
// Symbol that WriteLine emits without surrounding double quotes; presumably
// the epsilon (empty) label of the WFSA format -- confirm against the consumer.
const string EMPTY = "*e*";
// Appends one transition to the stream in the form
//   (node1 (node2 input output prob<end>))
// followed by a flushing newline.
//
//   fout         - stream receiving the line.
//   node1, node2 - source and destination state names.
//   first        - x in x:y; wrapped in double quotes unless it equals EMPTY.
//   second       - y in x:y; wrapped in double quotes unless it equals EMPTY.
//   prob         - weight printed immediately before `end`.
//   end          - suffix glued directly after the weight (callers pass "!").
void WriteLine(ofstream &fout, const string &node1, const string &node2,
               const string &first, const string &second, double prob,
               const string &end) {
  // The EMPTY marker is written bare; any other symbol becomes "symbol".
  const auto as_label = [](const string &symbol) -> string {
    if (symbol == EMPTY) {
      return symbol;
    }
    return "\"" + symbol + "\"";
  };

  const string input = as_label(first);
  const string output = as_label(second);

  fout << "(" << node1 << " (" << node2 << " " << input << " " << output << " "
       << prob << end << "))" << endl;
}
int main(int argc, char *argv[]) {
if (argc != 2) {
cerr << "Usage: ./<exec> <bigram-counts-file>" << endl;
return 0;
}
string filename_for_bigrams = argv[1];
// Get LM data for WFSA.
map<Notation, double> data; // Storage for log probabilities and counts.
vector<string> tag_list;
bool found = TagGrammarFinderSparse::GetBigramTagGrammarFromOrganizedRows(
filename_for_bigrams, &data, &tag_list);
if (!found) {
cerr << "Error getting tag grammar." << endl;
return 0;
}
if (PRINT_PROBS) {
ofstream fout;
fout.open("found_bigram_probs.txt");
for (map<Notation, double>::iterator it = data.begin(); it != data.end(); ++it) {
fout << it->first << ": " << it->second << endl;
}
fout.close();
cout << "Wrote to found_bigram_probs.txt\n";
}
// Begin writing out the WFSA.
cout << "Starting to write WFSA for English phonemes." << endl;
ofstream fout;
fout.open(WFSA_FILE.c_str());
fout << "END" << endl;
double prob_to_end = .0001;
double lambda = .9;
// Unigram probs.
for (int i = 0; i < tag_list.size(); ++i) {
string s = tag_list[i];
string node_name = s;
string node_name_sharp = node_name + "#";
vector<string> vec; vec.push_back(s);
Notation n("P", vec);
try {
double prob = data.at(n);
// Unigram prob.
WriteLine(fout, "START", node_name, EMPTY, s, prob, "!");
// Back to Start.
WriteLine(fout, node_name, "START", EMPTY, EMPTY, 1 - lambda, "!");
// Commit to bigram.
WriteLine(fout, node_name, node_name_sharp, EMPTY, EMPTY, lambda, "!");
// To End.
WriteLine(fout, node_name, "END", EMPTY, EMPTY, prob_to_end, "!");
} catch (out_of_range &e) {
cerr << "Out of range error for notation " << n << "; " << e.what() <<
endl;
exit(0);
}
}
// Bigram probs.
for (int i = 0; i < tag_list.size(); ++i) {
string s1 = tag_list[i];
for (int j = 0; j < tag_list.size(); ++j) {
string s2 = tag_list[j];
vector<string> vec1; vec1.push_back(s1);
vector<string> vec2; vec2.push_back(s2);
Notation n("P", vec2, TagGrammarFinderSparse::GIVEN_DELIM, vec1);
string node1_name_sharp = s1 + "#";
string node2_name = s2;
try {
double prob = data.at(n);
// Bigram prob - node sharp to node.
if (prob != 0)
WriteLine(fout, node1_name_sharp, node2_name, EMPTY, s2, prob, "!");
} catch (out_of_range &e) {
cerr << "Out of range error for notation " << n << "; " << e.what() <<
endl;
exit(0);
}
}
}
fout.close();
cout << "Written to " << WFSA_FILE << endl;
return 0;
}